In [1]:
#!pip install pandas==0.25
In [2]:
import pandas as pd
pd.__version__
Out[2]:
'1.1.5'
In [3]:
# Import modules
import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
import pandas_profiling as pp
import statsmodels.api as sm
from sklearn.model_selection import KFold,cross_val_score, train_test_split, GridSearchCV, learning_curve, validation_curve, RepeatedKFold
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, ElasticNet, Lasso, Ridge, BayesianRidge, LassoLarsIC
from sklearn.metrics import mean_squared_error, make_scorer, accuracy_score, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA

from sklearn.feature_selection import RFE, RFECV, SelectKBest, f_regression

# Stats
from scipy import stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore")
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000

# Figures inline and set visualization style
%matplotlib inline
sns.set()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
In [4]:
from google.colab import files
uploaded = files.upload()
import io

df_train = pd.read_csv(io.BytesIO(uploaded['train.csv']))
Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving train.csv to train.csv
In [5]:
print(df_train.head(n=4))
df_train.shape
   Id  MSSubClass MSZoning  ...  SaleType  SaleCondition SalePrice
0   1          60       RL  ...        WD         Normal    208500
1   2          20       RL  ...        WD         Normal    181500
2   3          60       RL  ...        WD         Normal    223500
3   4          70       RL  ...        WD        Abnorml    140000

[4 rows x 81 columns]
Out[5]:
(1460, 81)
In [6]:
n_train = df_train.shape[0]
print(n_train)
1460
In [7]:
#pp.ProfileReport(df_train)
In [8]:
uploaded = files.upload()
df_test = pd.read_csv(io.BytesIO(uploaded['test.csv']))
Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving test.csv to test.csv
In [9]:
print(df_test.shape)
df_test.head(n=4)
(1459, 80)
Out[9]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 1461 20 RH 80.000 11622 Pave NaN Reg Lvl AllPub Inside Gtl NAmes Feedr Norm 1Fam 1Story 5 6 1961 1961 Gable CompShg VinylSd VinylSd None 0.000 TA TA CBlock TA TA No Rec 468.000 LwQ 144.000 270.000 882.000 GasA TA Y SBrkr 896 0 0 896 0.000 0.000 1 0 2 1 TA 5 Typ 0 NaN Attchd 1961.000 Unf 1.000 730.000 TA TA Y 140 0 0 0 120 0 NaN MnPrv NaN 0 6 2010 WD Normal
1 1462 20 RL 81.000 14267 Pave NaN IR1 Lvl AllPub Corner Gtl NAmes Norm Norm 1Fam 1Story 6 6 1958 1958 Hip CompShg Wd Sdng Wd Sdng BrkFace 108.000 TA TA CBlock TA TA No ALQ 923.000 Unf 0.000 406.000 1329.000 GasA TA Y SBrkr 1329 0 0 1329 0.000 0.000 1 1 3 1 Gd 6 Typ 0 NaN Attchd 1958.000 Unf 1.000 312.000 TA TA Y 393 36 0 0 0 0 NaN NaN Gar2 12500 6 2010 WD Normal
2 1463 60 RL 74.000 13830 Pave NaN IR1 Lvl AllPub Inside Gtl Gilbert Norm Norm 1Fam 2Story 5 5 1997 1998 Gable CompShg VinylSd VinylSd None 0.000 TA TA PConc Gd TA No GLQ 791.000 Unf 0.000 137.000 928.000 GasA Gd Y SBrkr 928 701 0 1629 0.000 0.000 2 1 3 1 TA 6 Typ 1 TA Attchd 1997.000 Fin 2.000 482.000 TA TA Y 212 34 0 0 0 0 NaN MnPrv NaN 0 3 2010 WD Normal
3 1464 60 RL 78.000 9978 Pave NaN IR1 Lvl AllPub Inside Gtl Gilbert Norm Norm 1Fam 2Story 6 6 1998 1998 Gable CompShg VinylSd VinylSd BrkFace 20.000 TA TA PConc TA TA No GLQ 602.000 Unf 0.000 324.000 926.000 GasA Ex Y SBrkr 926 678 0 1604 0.000 0.000 2 1 3 1 Gd 7 Typ 1 Gd Attchd 1998.000 Fin 2.000 470.000 TA TA Y 360 36 0 0 0 0 NaN NaN NaN 0 6 2010 WD Normal
In [10]:
SalePrice_train = df_train.SalePrice
data = pd.concat([df_train.drop(['SalePrice'], axis=1), df_test])
In [11]:
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallCond    1460 non-null   int64  
 19  YearBuilt      1460 non-null   int64  
 20  YearRemodAdd   1460 non-null   int64  
 21  RoofStyle      1460 non-null   object 
 22  RoofMatl       1460 non-null   object 
 23  Exterior1st    1460 non-null   object 
 24  Exterior2nd    1460 non-null   object 
 25  MasVnrType     1452 non-null   object 
 26  MasVnrArea     1452 non-null   float64
 27  ExterQual      1460 non-null   object 
 28  ExterCond      1460 non-null   object 
 29  Foundation     1460 non-null   object 
 30  BsmtQual       1423 non-null   object 
 31  BsmtCond       1423 non-null   object 
 32  BsmtExposure   1422 non-null   object 
 33  BsmtFinType1   1423 non-null   object 
 34  BsmtFinSF1     1460 non-null   int64  
 35  BsmtFinType2   1422 non-null   object 
 36  BsmtFinSF2     1460 non-null   int64  
 37  BsmtUnfSF      1460 non-null   int64  
 38  TotalBsmtSF    1460 non-null   int64  
 39  Heating        1460 non-null   object 
 40  HeatingQC      1460 non-null   object 
 41  CentralAir     1460 non-null   object 
 42  Electrical     1459 non-null   object 
 43  1stFlrSF       1460 non-null   int64  
 44  2ndFlrSF       1460 non-null   int64  
 45  LowQualFinSF   1460 non-null   int64  
 46  GrLivArea      1460 non-null   int64  
 47  BsmtFullBath   1460 non-null   int64  
 48  BsmtHalfBath   1460 non-null   int64  
 49  FullBath       1460 non-null   int64  
 50  HalfBath       1460 non-null   int64  
 51  BedroomAbvGr   1460 non-null   int64  
 52  KitchenAbvGr   1460 non-null   int64  
 53  KitchenQual    1460 non-null   object 
 54  TotRmsAbvGrd   1460 non-null   int64  
 55  Functional     1460 non-null   object 
 56  Fireplaces     1460 non-null   int64  
 57  FireplaceQu    770 non-null    object 
 58  GarageType     1379 non-null   object 
 59  GarageYrBlt    1379 non-null   float64
 60  GarageFinish   1379 non-null   object 
 61  GarageCars     1460 non-null   int64  
 62  GarageArea     1460 non-null   int64  
 63  GarageQual     1379 non-null   object 
 64  GarageCond     1379 non-null   object 
 65  PavedDrive     1460 non-null   object 
 66  WoodDeckSF     1460 non-null   int64  
 67  OpenPorchSF    1460 non-null   int64  
 68  EnclosedPorch  1460 non-null   int64  
 69  3SsnPorch      1460 non-null   int64  
 70  ScreenPorch    1460 non-null   int64  
 71  PoolArea       1460 non-null   int64  
 72  PoolQC         7 non-null      object 
 73  Fence          281 non-null    object 
 74  MiscFeature    54 non-null     object 
 75  MiscVal        1460 non-null   int64  
 76  MoSold         1460 non-null   int64  
 77  YrSold         1460 non-null   int64  
 78  SaleType       1460 non-null   object 
 79  SaleCondition  1460 non-null   object 
 80  SalePrice      1460 non-null   int64  
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
In [12]:
df_train.SalePrice.describe()
Out[12]:
count     1460.000
mean    180921.196
std      79442.503
min      34900.000
25%     129975.000
50%     163000.000
75%     214000.000
max     755000.000
Name: SalePrice, dtype: float64
In [13]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [8, 5]})
sns.histplot(df_train['SalePrice'],kde=True)
plt.title("Histogram for SalePrice")
# Skew and kurt
print("Skewness: %f" % df_train['SalePrice'].skew())
print("Kurtosis: %f" % df_train['SalePrice'].kurt())
Skewness: 1.882876
Kurtosis: 6.536282

Figure 1.1: distribution of the dependent variable sale prices

In [14]:
df_train.SalePrice.plot.box()
plt.tight_layout(pad=0.5)

Figure 1.2: box plot of the dependent variable sale prices

In [15]:
stats.probplot(df_train.SalePrice, plot=sns.mpl.pyplot)
Out[15]:
((array([-3.30513952, -3.04793228, -2.90489705, ...,  2.90489705,
          3.04793228,  3.30513952]),
  array([ 34900,  35311,  37900, ..., 625000, 745000, 755000])),
 (74160.16474519414, 180921.19589041095, 0.9319665641512983))

Figure 1.3: Q-Q plot of the dependent variable sale prices

In [16]:
df_train.describe()
Out[16]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
count 1460.000 1460.000 1201.000 1460.000 1460.000 1460.000 1460.000 1460.000 1452.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1379.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000 1460.000
mean 730.500 56.897 70.050 10516.828 6.099 5.575 1971.268 1984.866 103.685 443.640 46.549 567.240 1057.429 1162.627 346.992 5.845 1515.464 0.425 0.058 1.565 0.383 2.866 1.047 6.518 0.613 1978.506 1.767 472.980 94.245 46.660 21.954 3.410 15.061 2.759 43.489 6.322 2007.816 180921.196
std 421.610 42.301 24.285 9981.265 1.383 1.113 30.203 20.645 181.066 456.098 161.319 441.867 438.705 386.588 436.528 48.623 525.480 0.519 0.239 0.551 0.503 0.816 0.220 1.625 0.645 24.690 0.747 213.805 125.339 66.256 61.119 29.317 55.757 40.177 496.123 2.704 1.328 79442.503
min 1.000 20.000 21.000 1300.000 1.000 1.000 1872.000 1950.000 0.000 0.000 0.000 0.000 0.000 334.000 0.000 0.000 334.000 0.000 0.000 0.000 0.000 0.000 0.000 2.000 0.000 1900.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 1.000 2006.000 34900.000
25% 365.750 20.000 59.000 7553.500 5.000 5.000 1954.000 1967.000 0.000 0.000 0.000 223.000 795.750 882.000 0.000 0.000 1129.500 0.000 0.000 1.000 0.000 2.000 1.000 5.000 0.000 1961.000 1.000 334.500 0.000 0.000 0.000 0.000 0.000 0.000 0.000 5.000 2007.000 129975.000
50% 730.500 50.000 69.000 9478.500 6.000 5.000 1973.000 1994.000 0.000 383.500 0.000 477.500 991.500 1087.000 0.000 0.000 1464.000 0.000 0.000 2.000 0.000 3.000 1.000 6.000 1.000 1980.000 2.000 480.000 0.000 25.000 0.000 0.000 0.000 0.000 0.000 6.000 2008.000 163000.000
75% 1095.250 70.000 80.000 11601.500 7.000 6.000 2000.000 2004.000 166.000 712.250 0.000 808.000 1298.250 1391.250 728.000 0.000 1776.750 1.000 0.000 2.000 1.000 3.000 1.000 7.000 1.000 2002.000 2.000 576.000 168.000 68.000 0.000 0.000 0.000 0.000 0.000 8.000 2009.000 214000.000
max 1460.000 190.000 313.000 215245.000 10.000 9.000 2010.000 2010.000 1600.000 5644.000 1474.000 2336.000 6110.000 4692.000 2065.000 572.000 5642.000 3.000 2.000 3.000 2.000 8.000 3.000 14.000 3.000 2010.000 4.000 1418.000 857.000 547.000 552.000 508.000 480.000 738.000 15500.000 12.000 2010.000 755000.000
In [17]:
data_percent_missing = data.isnull().sum() * 100 / len(data)
data_missing_value = pd.DataFrame({'column_name': data.columns,'percent_missing': data_percent_missing})
data_missing_value = data_missing_value.sort_values('percent_missing',ascending=False)
data_missing_value['dtypes'] = data.dtypes
data_missing_value = data_missing_value[data_missing_value.percent_missing>0]
print(data_missing_value)
               column_name  percent_missing   dtypes
PoolQC              PoolQC           99.657   object
MiscFeature    MiscFeature           96.403   object
Alley                Alley           93.217   object
Fence                Fence           80.439   object
FireplaceQu    FireplaceQu           48.647   object
LotFrontage    LotFrontage           16.650  float64
GarageYrBlt    GarageYrBlt            5.447  float64
GarageFinish  GarageFinish            5.447   object
GarageQual      GarageQual            5.447   object
GarageCond      GarageCond            5.447   object
GarageType      GarageType            5.379   object
BsmtExposure  BsmtExposure            2.809   object
BsmtCond          BsmtCond            2.809   object
BsmtQual          BsmtQual            2.775   object
BsmtFinType2  BsmtFinType2            2.741   object
BsmtFinType1  BsmtFinType1            2.706   object
MasVnrType      MasVnrType            0.822   object
MasVnrArea      MasVnrArea            0.788  float64
MSZoning          MSZoning            0.137   object
Functional      Functional            0.069   object
BsmtHalfBath  BsmtHalfBath            0.069  float64
BsmtFullBath  BsmtFullBath            0.069  float64
Utilities        Utilities            0.069   object
GarageCars      GarageCars            0.034  float64
KitchenQual    KitchenQual            0.034   object
BsmtFinSF1      BsmtFinSF1            0.034  float64
SaleType          SaleType            0.034   object
BsmtFinSF2      BsmtFinSF2            0.034  float64
BsmtUnfSF        BsmtUnfSF            0.034  float64
TotalBsmtSF    TotalBsmtSF            0.034  float64
Exterior2nd    Exterior2nd            0.034   object
Exterior1st    Exterior1st            0.034   object
GarageArea      GarageArea            0.034  float64
Electrical      Electrical            0.034   object
In [18]:
data_missing_object = data_missing_value[data_missing_value['dtypes']=='object']['column_name'].tolist()
data_missing_num = data_missing_value[data_missing_value['dtypes']!='object']['column_name'].tolist()
print(data_missing_object)
print(data_missing_num)
['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'GarageType', 'BsmtExposure', 'BsmtCond', 'BsmtQual', 'BsmtFinType2', 'BsmtFinType1', 'MasVnrType', 'MSZoning', 'Functional', 'Utilities', 'KitchenQual', 'SaleType', 'Exterior2nd', 'Exterior1st', 'Electrical']
['LotFrontage', 'GarageYrBlt', 'MasVnrArea', 'BsmtHalfBath', 'BsmtFullBath', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageArea']

Figure 4.2: imputation for missing data

In [19]:
for col in data_missing_object:
        data[col] = data[col].fillna('_NA_');
for col in data_missing_num:
        data[col] = data[col].fillna(0);
#for col in ['GarageYrBlt']:
#       data[col] = data.groupby('Neighborhood')[col].transform(lambda x: x.fillna(x.median()))
In [20]:
data_percent_missing = data.isnull().sum() * 100 / len(data)
data_missing_value = pd.DataFrame({'column_name': data.columns,'percent_missing': data_percent_missing})
data_missing_value = data_missing_value.sort_values('percent_missing',ascending=False)
data_missing_value['dtypes'] = data.dtypes
data_missing_value[data_missing_value.percent_missing>0]
Out[20]:
column_name percent_missing dtypes
In [21]:
data.head()
Out[21]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 1 60 RL 65.000 8450 Pave _NA_ Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196.000 Gd TA PConc Gd TA No GLQ 706.000 Unf 0.000 150.000 856.000 GasA Ex Y SBrkr 856 854 0 1710 1.000 0.000 2 1 3 1 Gd 8 Typ 0 _NA_ Attchd 2003.000 RFn 2.000 548.000 TA TA Y 0 61 0 0 0 0 _NA_ _NA_ _NA_ 0 2 2008 WD Normal
1 2 20 RL 80.000 9600 Pave _NA_ Reg Lvl AllPub FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0.000 TA TA CBlock Gd TA Gd ALQ 978.000 Unf 0.000 284.000 1262.000 GasA Ex Y SBrkr 1262 0 0 1262 0.000 1.000 2 0 3 1 TA 6 Typ 1 TA Attchd 1976.000 RFn 2.000 460.000 TA TA Y 298 0 0 0 0 0 _NA_ _NA_ _NA_ 0 5 2007 WD Normal
2 3 60 RL 68.000 11250 Pave _NA_ IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162.000 Gd TA PConc Gd TA Mn GLQ 486.000 Unf 0.000 434.000 920.000 GasA Ex Y SBrkr 920 866 0 1786 1.000 0.000 2 1 3 1 Gd 6 Typ 1 TA Attchd 2001.000 RFn 2.000 608.000 TA TA Y 0 42 0 0 0 0 _NA_ _NA_ _NA_ 0 9 2008 WD Normal
3 4 70 RL 60.000 9550 Pave _NA_ IR1 Lvl AllPub Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0.000 TA TA BrkTil TA Gd No ALQ 216.000 Unf 0.000 540.000 756.000 GasA Gd Y SBrkr 961 756 0 1717 1.000 0.000 1 0 3 1 Gd 7 Typ 1 Gd Detchd 1998.000 Unf 3.000 642.000 TA TA Y 0 35 272 0 0 0 _NA_ _NA_ _NA_ 0 2 2006 WD Abnorml
4 5 60 RL 84.000 14260 Pave _NA_ IR1 Lvl AllPub FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350.000 Gd TA PConc Gd TA Av GLQ 655.000 Unf 0.000 490.000 1145.000 GasA Ex Y SBrkr 1145 1053 0 2198 1.000 0.000 2 1 4 1 Gd 9 Typ 1 TA Attchd 2000.000 RFn 3.000 836.000 TA TA Y 192 84 0 0 0 0 _NA_ _NA_ _NA_ 0 12 2008 WD Normal

Figure 4.1: feature creation

In [22]:
data['TotalSF'] = data['1stFlrSF']  + data['2ndFlrSF'] + data['TotalBsmtSF']
data['TotalPorchSF'] = data['OpenPorchSF']+data['EnclosedPorch']+data['3SsnPorch']+data['ScreenPorch']+data['WoodDeckSF']
data['HouseAge'] = data.YrSold - data.YearBuilt
data['QualityIndex'] = data.OverallQual * data.OverallCond
data['Total_Bathrooms'] = data.BsmtFullBath + .5*data.BsmtHalfBath + data.FullBath + .5*data.HalfBath
data['Has_Fireplaces'] = np.where(data['Fireplaces']>=1, 1, 0)
data['Has_Bsmt'] = np.where(data['TotalBsmtSF']>=0, 1, 0)
data['Has_Garage'] = np.where(data['GarageArea']>=0, 1, 0)
data['Has_Pool'] = np.where(data['PoolArea']>=0, 1, 0)
data['Has_2ndStory'] = np.where(data['2ndFlrSF']>=0, 1, 0)
data.head()
Out[22]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating ... HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition TotalSF TotalPorchSF HouseAge QualityIndex Total_Bathrooms Has_Fireplaces Has_Bsmt Has_Garage Has_Pool Has_2ndStory
0 1 60 RL 65.000 8450 Pave _NA_ Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196.000 Gd TA PConc Gd TA No GLQ 706.000 Unf 0.000 150.000 856.000 GasA ... 1 3 1 Gd 8 Typ 0 _NA_ Attchd 2003.000 RFn 2.000 548.000 TA TA Y 0 61 0 0 0 0 _NA_ _NA_ _NA_ 0 2 2008 WD Normal 2566.000 61 5 35 3.500 0 1 1 1 1
1 2 20 RL 80.000 9600 Pave _NA_ Reg Lvl AllPub FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0.000 TA TA CBlock Gd TA Gd ALQ 978.000 Unf 0.000 284.000 1262.000 GasA ... 0 3 1 TA 6 Typ 1 TA Attchd 1976.000 RFn 2.000 460.000 TA TA Y 298 0 0 0 0 0 _NA_ _NA_ _NA_ 0 5 2007 WD Normal 2524.000 298 31 48 2.500 1 1 1 1 1
2 3 60 RL 68.000 11250 Pave _NA_ IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162.000 Gd TA PConc Gd TA Mn GLQ 486.000 Unf 0.000 434.000 920.000 GasA ... 1 3 1 Gd 6 Typ 1 TA Attchd 2001.000 RFn 2.000 608.000 TA TA Y 0 42 0 0 0 0 _NA_ _NA_ _NA_ 0 9 2008 WD Normal 2706.000 42 7 35 3.500 1 1 1 1 1
3 4 70 RL 60.000 9550 Pave _NA_ IR1 Lvl AllPub Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0.000 TA TA BrkTil TA Gd No ALQ 216.000 Unf 0.000 540.000 756.000 GasA ... 0 3 1 Gd 7 Typ 1 Gd Detchd 1998.000 Unf 3.000 642.000 TA TA Y 0 35 272 0 0 0 _NA_ _NA_ _NA_ 0 2 2006 WD Abnorml 2473.000 307 91 35 2.000 1 1 1 1 1
4 5 60 RL 84.000 14260 Pave _NA_ IR1 Lvl AllPub FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350.000 Gd TA PConc Gd TA Av GLQ 655.000 Unf 0.000 490.000 1145.000 GasA ... 1 4 1 Gd 9 Typ 1 TA Attchd 2000.000 RFn 3.000 836.000 TA TA Y 192 84 0 0 0 0 _NA_ _NA_ _NA_ 0 12 2008 WD Normal 3343.000 276 8 40 3.500 1 1 1 1 1

5 rows × 90 columns

Figure 2.1: list of columns of the percentage of missing values

In [23]:
df_train_num =  df_train.select_dtypes(include=np.number)
df_train_num_predictors = df_train_num.drop(['SalePrice'], axis=1)
print(df_train_num_predictors.shape)
df_train_num_predictors.hist(bins=10, figsize=(20, 25), layout=(8, 5));
(1460, 37)
In [24]:
# visualising some more outliers in the data values
fig, axs = plt.subplots(ncols=2, nrows=0, figsize=(12, 150))
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
sns.color_palette("husl", 8)
for i, feature in enumerate(list(df_train_num_predictors), 1):
    if(feature=='MiscVal'):
        break
    plt.subplot(len(list(df_train_num_predictors)), 3, i)
    sns.scatterplot(x=feature, y='SalePrice', hue='SalePrice', palette='Blues', data=df_train)
        
    plt.xlabel('{}'.format(feature), size=15,labelpad=12.5)
    plt.ylabel('SalePrice', size=15, labelpad=12.5)
    
    for j in range(2):
        plt.tick_params(axis='x', labelsize=12)
        plt.tick_params(axis='y', labelsize=12)
    
    plt.legend(loc='best', prop={'size': 10})
        
plt.show()

Figure 3.1: scatter plots of continuous variables versus the sale price

In [25]:
# Outliers
print(df_train[(df_train.GrLivArea>4500) & (df_train.SalePrice<200000)])
print(df_train[(df_train['1stFlrSF']>4500) & (df_train.SalePrice<200000)])
print(df_train[(df_train.LotFrontage>300)])
        Id  MSSubClass MSZoning  ...  SaleType  SaleCondition SalePrice
523    524          60       RL  ...       New        Partial    184750
1298  1299          60       RL  ...       New        Partial    160000

[2 rows x 81 columns]
        Id  MSSubClass MSZoning  ...  SaleType  SaleCondition SalePrice
1298  1299          60       RL  ...       New        Partial    160000

[1 rows x 81 columns]
        Id  MSSubClass MSZoning  ...  SaleType  SaleCondition SalePrice
934    935          20       RL  ...        WD         Normal    242000
1298  1299          60       RL  ...       New        Partial    160000

[2 rows x 81 columns]

Figure 2.2: list of outliers

In [26]:
df_train_categorical =  df_train.select_dtypes(exclude=np.number)
print("Categorical:", df_train_categorical.shape)
df_train_num =  df_train.select_dtypes(include=np.number)
df_train_num = df_train_num.drop(['Id'], axis=1)
print("Numerical:", df_train_num.shape)
plt.subplots(figsize=(38, 38))
sns.heatmap(df_train_num.corr(), annot = True, vmin=-1, vmax=1, center= 0, cmap= 'coolwarm', fmt='.1g')
Categorical: (1460, 43)
Numerical: (1460, 37)
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f13619f6dd0>

Figure 3.2: correlation matrix

In [27]:
# visualising some more outliers in the data values
fig, ax = plt.subplots(15, 3, figsize=(20, 100))
for var, subplot in zip(df_train_categorical, ax.flatten()):
    sns.countplot(x=var, data=df_train, ax=subplot)

Figure 3.3: bar plot of categorical variables

In [28]:
fig, ax = plt.subplots(15, 3, figsize=(20, 100))
for var, subplot in zip(df_train_categorical, ax.flatten()):
    sns.boxplot(x=var, y='SalePrice', data=df_train, ax=subplot)

Figure 3.4: box plot of categorical variables

In [29]:
fig = plt.figure(figsize = (25,60))
sns.countplot(x='Neighborhood', data=df_train, ax=fig.add_subplot(6,1,1));
sns.boxplot(x='Neighborhood', y='SalePrice', data=df_train, ax=fig.add_subplot(6,1,2));

sns.countplot(x='Exterior1st', data=df_train, ax=fig.add_subplot(6,1,3));
sns.boxplot(x='Exterior1st', y='SalePrice', data=df_train, ax=fig.add_subplot(6,1,4));

sns.countplot(x='Exterior2nd', data=df_train, ax=fig.add_subplot(6,1,5));
sns.boxplot(x='Exterior2nd', y='SalePrice', data=df_train, ax=fig.add_subplot(6,1,6));
In [30]:
print(list(data.columns))
['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'TotalSF', 'TotalPorchSF', 'HouseAge', 'QualityIndex', 'Total_Bathrooms', 'Has_Fireplaces', 'Has_Bsmt', 'Has_Garage', 'Has_Pool', 'Has_2ndStory']

Figure 4.3: encode categorical variables

In [31]:
#data = pd.get_dummies(data, columns=list(df_train_categorical.columns), drop_first=True)
data.head()
Out[31]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating ... HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition TotalSF TotalPorchSF HouseAge QualityIndex Total_Bathrooms Has_Fireplaces Has_Bsmt Has_Garage Has_Pool Has_2ndStory
0 1 60 RL 65.000 8450 Pave _NA_ Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196.000 Gd TA PConc Gd TA No GLQ 706.000 Unf 0.000 150.000 856.000 GasA ... 1 3 1 Gd 8 Typ 0 _NA_ Attchd 2003.000 RFn 2.000 548.000 TA TA Y 0 61 0 0 0 0 _NA_ _NA_ _NA_ 0 2 2008 WD Normal 2566.000 61 5 35 3.500 0 1 1 1 1
1 2 20 RL 80.000 9600 Pave _NA_ Reg Lvl AllPub FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0.000 TA TA CBlock Gd TA Gd ALQ 978.000 Unf 0.000 284.000 1262.000 GasA ... 0 3 1 TA 6 Typ 1 TA Attchd 1976.000 RFn 2.000 460.000 TA TA Y 298 0 0 0 0 0 _NA_ _NA_ _NA_ 0 5 2007 WD Normal 2524.000 298 31 48 2.500 1 1 1 1 1
2 3 60 RL 68.000 11250 Pave _NA_ IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162.000 Gd TA PConc Gd TA Mn GLQ 486.000 Unf 0.000 434.000 920.000 GasA ... 1 3 1 Gd 6 Typ 1 TA Attchd 2001.000 RFn 2.000 608.000 TA TA Y 0 42 0 0 0 0 _NA_ _NA_ _NA_ 0 9 2008 WD Normal 2706.000 42 7 35 3.500 1 1 1 1 1
3 4 70 RL 60.000 9550 Pave _NA_ IR1 Lvl AllPub Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0.000 TA TA BrkTil TA Gd No ALQ 216.000 Unf 0.000 540.000 756.000 GasA ... 0 3 1 Gd 7 Typ 1 Gd Detchd 1998.000 Unf 3.000 642.000 TA TA Y 0 35 272 0 0 0 _NA_ _NA_ _NA_ 0 2 2006 WD Abnorml 2473.000 307 91 35 2.000 1 1 1 1 1
4 5 60 RL 84.000 14260 Pave _NA_ IR1 Lvl AllPub FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350.000 Gd TA PConc Gd TA Av GLQ 655.000 Unf 0.000 490.000 1145.000 GasA ... 1 4 1 Gd 9 Typ 1 TA Attchd 2000.000 RFn 3.000 836.000 TA TA Y 192 84 0 0 0 0 _NA_ _NA_ _NA_ 0 12 2008 WD Normal 3343.000 276 8 40 3.500 1 1 1 1 1

5 rows × 90 columns

In [32]:
# Fetch all numeric features
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric = []
for i in data.columns:
    if data[i].dtype in numeric_dtypes:
        numeric.append(i)

# Create box plots for all numeric features
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
ax.set_xscale("log")
ax = sns.boxplot(data=data[numeric] , orient="h", palette="Set1")
ax.xaxis.grid(False)
ax.set(ylabel="Feature names")
ax.set(xlabel="Numeric values")
ax.set(title="Numeric Distribution of Features")
sns.despine(trim=True, left=True)
In [33]:
# Find skewed numerical features
skew_features = data[numeric].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

print("There are {} numerical features with Skew > 0.5 :".format(high_skew.shape[0]))
skewness = pd.DataFrame({'Skew' :high_skew})
skew_features.head(10)
There are 28 numerical features with Skew > 0.5 :
Out[33]:
MiscVal         21.947
PoolArea        16.898
LotArea         12.822
LowQualFinSF    12.089
3SsnPorch       11.376
KitchenAbvGr     4.302
BsmtFinSF2       4.146
EnclosedPorch    4.004
ScreenPorch      3.947
BsmtHalfBath     3.932
dtype: float64

Figure 6.5: Box Cox to skewed features

In [34]:
# Normalize skewed features
for i in skew_index:
  print(i)
  data[i] = data[i]+1
  data[i] = boxcox1p(data[i], boxcox_normmax(data[i]+1))
MiscVal
PoolArea
LotArea
LowQualFinSF
3SsnPorch
KitchenAbvGr
BsmtFinSF2
EnclosedPorch
ScreenPorch
BsmtHalfBath
MasVnrArea
OpenPorchSF
WoodDeckSF
TotalSF
1stFlrSF
BsmtFinSF1
TotalPorchSF
MSSubClass
GrLivArea
TotalBsmtSF
BsmtUnfSF
2ndFlrSF
TotRmsAbvGrd
Fireplaces
HalfBath
BsmtFullBath
HouseAge
OverallCond
In [35]:
# Let's make sure we handled all the skewed values
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
ax.set_xscale("log")
ax = sns.boxplot(data=data[skew_index] , orient="h", palette="Set1")
ax.xaxis.grid(False)
ax.set(ylabel="Feature names")
ax.set(xlabel="Numeric values")
ax.set(title="Numeric Distribution of Features")
sns.despine(trim=True, left=True)
In [36]:
def logs(res, ls):
    m = res.shape[1]
    for l in ls:
        res = res.assign(newcol=pd.Series(np.log(1.01+res[l])).values)   
        res.columns.values[m] = l + '_log'
        m += 1
    return res

log_features = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF',
                 'TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea',
                 'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr',
                 'TotRmsAbvGrd','Fireplaces','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF',
                 'EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal','YearRemodAdd','TotalSF','HouseAge']

data = logs(data, log_features)

Figure 6.6: Log transformation

In [37]:
def squares(res, ls):
    m = res.shape[1]
    for l in ls:
        res = res.assign(newcol=pd.Series(res[l]*res[l]).values)   
        res.columns.values[m] = l + '_sq'
        m += 1
    return res 

squared_features = ['YearRemodAdd', 'LotFrontage_log', 
              'TotalBsmtSF_log', '1stFlrSF_log', '2ndFlrSF_log', 'GrLivArea_log',
              'GarageCars_log', 'GarageArea_log','TotalSF_log','HouseAge_log']
data = squares(data, squared_features)

Figure 6.7: Square of Log transformation

In [38]:
data= pd.get_dummies(data).reset_index(drop=True)
print(data.shape)
data = data.loc[:,~data.columns.duplicated()]
print(data.shape)
(2919, 363)
(2919, 363)

Figure 6.8: Converts categorical data into dummy or indicator variables

In [39]:
df_train = data.iloc[:n_train]
df_test = data.iloc[n_train:]
print(df_train.shape)
print(df_test.shape)
df_train['SalePrice'] = SalePrice_train
df_train = df_train[~df_train.Id.isin([1299,524,935])]
print(df_train.shape)
df_train.head()
(1460, 363)
(1459, 363)
(1457, 364)
Out[39]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold TotalSF TotalPorchSF HouseAge ... GarageCond_Ex GarageCond_Fa GarageCond_Gd GarageCond_Po GarageCond_TA GarageCond__NA_ PavedDrive_N PavedDrive_P PavedDrive_Y PoolQC_Ex PoolQC_Fa PoolQC_Gd PoolQC__NA_ Fence_GdPrv Fence_GdWo Fence_MnPrv Fence_MnWw Fence__NA_ MiscFeature_Gar2 MiscFeature_Othr MiscFeature_Shed MiscFeature_TenC MiscFeature__NA_ SaleType_COD SaleType_CWD SaleType_Con SaleType_ConLD SaleType_ConLI SaleType_ConLw SaleType_New SaleType_Oth SaleType_WD SaleType__NA_ SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial SalePrice
0 1 6.329 65.000 13.417 7 4.386 2003 2003 18.267 116.344 0.726 29.495 293.991 5.302 1022.665 0.660 7.214 1.958 0.597 2 2.382 3 0.992 2.001 0.854 2003.000 2.000 548.000 0.867 11.192 0.769 0.697 0.819 0.676 0.691 2 2008 22.926 18.418 3.523 ... 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 208500
1 2 4.237 80.000 13.687 6 6.132 1976 1976 0.799 144.762 0.726 43.181 402.988 5.533 1.012 0.660 6.929 0.987 0.871 2 1.109 3 0.992 1.832 1.538 1976.000 2.000 460.000 54.225 0.805 0.769 0.697 0.819 0.676 0.691 5 2007 22.820 49.659 10.863 ... 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 181500
2 3 6.329 68.000 14.026 7 4.386 2001 2002 16.758 90.526 0.726 55.501 311.729 5.346 1037.464 0.660 7.255 1.958 0.597 2 2.382 3 0.992 1.832 1.538 2001.000 2.000 608.000 0.867 9.292 0.769 0.697 0.819 0.676 0.691 9 2008 23.270 14.520 4.325 ... 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 223500
3 4 6.667 60.000 13.675 7 4.386 1915 1970 0.799 52.370 0.726 63.129 265.762 5.372 902.062 0.660 7.218 1.958 0.597 1 1.109 3 0.992 1.922 1.538 1998.000 3.000 642.000 0.867 8.472 14.294 0.697 0.819 0.676 0.691 2 2006 22.689 50.579 20.806 ... 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 140000
4 5 6.329 84.000 14.541 8 4.386 2000 2000 23.649 110.632 0.726 59.618 372.366 5.476 1268.841 0.660 7.449 1.958 0.597 2 2.382 4 0.992 2.072 1.538 2000.000 3.000 836.000 40.966 13.088 0.769 0.697 0.819 0.676 0.691 12 2008 24.682 47.362 4.697 ... 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 250000

5 rows × 364 columns

Figure 5.1: perform both min-max and standard scaling on the dependent variable

In [40]:
# log(1+x) transform
df_train["Log1p_SalePrice"] = np.log1p(df_train['SalePrice'] )

# define standard scaler
scaler = StandardScaler()
df_train["StandardScal_SalePrice"] = scaler.fit_transform(df_train[['SalePrice']] )

# define max-min scaler
scaler = MinMaxScaler()
df_train["MaxMinScal_SalePrice"] = scaler.fit_transform(df_train[['SalePrice']] )
df_train.head()
Out[40]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold TotalSF TotalPorchSF HouseAge ... GarageCond_Po GarageCond_TA GarageCond__NA_ PavedDrive_N PavedDrive_P PavedDrive_Y PoolQC_Ex PoolQC_Fa PoolQC_Gd PoolQC__NA_ Fence_GdPrv Fence_GdWo Fence_MnPrv Fence_MnWw Fence__NA_ MiscFeature_Gar2 MiscFeature_Othr MiscFeature_Shed MiscFeature_TenC MiscFeature__NA_ SaleType_COD SaleType_CWD SaleType_Con SaleType_ConLD SaleType_ConLI SaleType_ConLw SaleType_New SaleType_Oth SaleType_WD SaleType__NA_ SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial SalePrice Log1p_SalePrice StandardScal_SalePrice MaxMinScal_SalePrice
0 1 6.329 65.000 13.417 7 4.386 2003 2003 18.267 116.344 0.726 29.495 293.991 5.302 1022.665 0.660 7.214 1.958 0.597 2 2.382 3 0.992 2.001 0.854 2003.000 2.000 548.000 0.867 11.192 0.769 0.697 0.819 0.676 0.691 2 2008 22.926 18.418 3.523 ... 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 208500 12.248 0.347 0.241
1 2 4.237 80.000 13.687 6 6.132 1976 1976 0.799 144.762 0.726 43.181 402.988 5.533 1.012 0.660 6.929 0.987 0.871 2 1.109 3 0.992 1.832 1.538 1976.000 2.000 460.000 54.225 0.805 0.769 0.697 0.819 0.676 0.691 5 2007 22.820 49.659 10.863 ... 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 181500 12.109 0.008 0.204
2 3 6.329 68.000 14.026 7 4.386 2001 2002 16.758 90.526 0.726 55.501 311.729 5.346 1037.464 0.660 7.255 1.958 0.597 2 2.382 3 0.992 1.832 1.538 2001.000 2.000 608.000 0.867 9.292 0.769 0.697 0.819 0.676 0.691 9 2008 23.270 14.520 4.325 ... 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 223500 12.317 0.536 0.262
3 4 6.667 60.000 13.675 7 4.386 1915 1970 0.799 52.370 0.726 63.129 265.762 5.372 902.062 0.660 7.218 1.958 0.597 1 1.109 3 0.992 1.922 1.538 1998.000 3.000 642.000 0.867 8.472 14.294 0.697 0.819 0.676 0.691 2 2006 22.689 50.579 20.806 ... 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 140000 11.849 -0.514 0.146
4 5 6.329 84.000 14.541 8 4.386 2000 2000 23.649 110.632 0.726 59.618 372.366 5.476 1268.841 0.660 7.449 1.958 0.597 2 2.382 4 0.992 2.072 1.538 2000.000 3.000 836.000 40.966 13.088 0.769 0.697 0.819 0.676 0.691 12 2008 24.682 47.362 4.697 ... 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 250000 12.429 0.870 0.299

5 rows × 367 columns

In [41]:
df_train.dtypes
Out[41]:
Id                          int64
MSSubClass                float64
LotFrontage               float64
LotArea                   float64
OverallQual                 int64
OverallCond               float64
YearBuilt                   int64
YearRemodAdd                int64
MasVnrArea                float64
BsmtFinSF1                float64
BsmtFinSF2                float64
BsmtUnfSF                 float64
TotalBsmtSF               float64
1stFlrSF                  float64
2ndFlrSF                  float64
LowQualFinSF              float64
GrLivArea                 float64
BsmtFullBath              float64
BsmtHalfBath              float64
FullBath                    int64
HalfBath                  float64
BedroomAbvGr                int64
KitchenAbvGr              float64
TotRmsAbvGrd              float64
Fireplaces                float64
GarageYrBlt               float64
GarageCars                float64
GarageArea                float64
WoodDeckSF                float64
OpenPorchSF               float64
EnclosedPorch             float64
3SsnPorch                 float64
ScreenPorch               float64
PoolArea                  float64
MiscVal                   float64
MoSold                      int64
YrSold                      int64
TotalSF                   float64
TotalPorchSF              float64
HouseAge                  float64
QualityIndex                int64
Total_Bathrooms           float64
Has_Fireplaces              int64
Has_Bsmt                    int64
Has_Garage                  int64
Has_Pool                    int64
Has_2ndStory                int64
LotFrontage_log           float64
LotArea_log               float64
MasVnrArea_log            float64
BsmtFinSF1_log            float64
BsmtFinSF2_log            float64
BsmtUnfSF_log             float64
TotalBsmtSF_log           float64
1stFlrSF_log              float64
2ndFlrSF_log              float64
LowQualFinSF_log          float64
GrLivArea_log             float64
BsmtFullBath_log          float64
BsmtHalfBath_log          float64
FullBath_log              float64
HalfBath_log              float64
BedroomAbvGr_log          float64
KitchenAbvGr_log          float64
TotRmsAbvGrd_log          float64
Fireplaces_log            float64
GarageCars_log            float64
GarageArea_log            float64
WoodDeckSF_log            float64
OpenPorchSF_log           float64
EnclosedPorch_log         float64
3SsnPorch_log             float64
ScreenPorch_log           float64
PoolArea_log              float64
MiscVal_log               float64
YearRemodAdd_log          float64
TotalSF_log               float64
HouseAge_log              float64
YearRemodAdd_sq             int64
LotFrontage_log_sq        float64
TotalBsmtSF_log_sq        float64
1stFlrSF_log_sq           float64
2ndFlrSF_log_sq           float64
GrLivArea_log_sq          float64
GarageCars_log_sq         float64
GarageArea_log_sq         float64
TotalSF_log_sq            float64
HouseAge_log_sq           float64
MSZoning_C (all)            uint8
MSZoning_FV                 uint8
MSZoning_RH                 uint8
MSZoning_RL                 uint8
MSZoning_RM                 uint8
MSZoning__NA_               uint8
Street_Grvl                 uint8
Street_Pave                 uint8
Alley_Grvl                  uint8
Alley_Pave                  uint8
Alley__NA_                  uint8
LotShape_IR1                uint8
LotShape_IR2                uint8
LotShape_IR3                uint8
LotShape_Reg                uint8
LandContour_Bnk             uint8
LandContour_HLS             uint8
LandContour_Low             uint8
LandContour_Lvl             uint8
Utilities_AllPub            uint8
Utilities_NoSeWa            uint8
Utilities__NA_              uint8
LotConfig_Corner            uint8
LotConfig_CulDSac           uint8
LotConfig_FR2               uint8
LotConfig_FR3               uint8
LotConfig_Inside            uint8
LandSlope_Gtl               uint8
LandSlope_Mod               uint8
LandSlope_Sev               uint8
Neighborhood_Blmngtn        uint8
Neighborhood_Blueste        uint8
Neighborhood_BrDale         uint8
Neighborhood_BrkSide        uint8
Neighborhood_ClearCr        uint8
Neighborhood_CollgCr        uint8
Neighborhood_Crawfor        uint8
Neighborhood_Edwards        uint8
Neighborhood_Gilbert        uint8
Neighborhood_IDOTRR         uint8
Neighborhood_MeadowV        uint8
Neighborhood_Mitchel        uint8
Neighborhood_NAmes          uint8
Neighborhood_NPkVill        uint8
Neighborhood_NWAmes         uint8
Neighborhood_NoRidge        uint8
Neighborhood_NridgHt        uint8
Neighborhood_OldTown        uint8
Neighborhood_SWISU          uint8
Neighborhood_Sawyer         uint8
Neighborhood_SawyerW        uint8
Neighborhood_Somerst        uint8
Neighborhood_StoneBr        uint8
Neighborhood_Timber         uint8
Neighborhood_Veenker        uint8
Condition1_Artery           uint8
Condition1_Feedr            uint8
Condition1_Norm             uint8
Condition1_PosA             uint8
Condition1_PosN             uint8
Condition1_RRAe             uint8
Condition1_RRAn             uint8
Condition1_RRNe             uint8
Condition1_RRNn             uint8
Condition2_Artery           uint8
Condition2_Feedr            uint8
Condition2_Norm             uint8
Condition2_PosA             uint8
Condition2_PosN             uint8
Condition2_RRAe             uint8
Condition2_RRAn             uint8
Condition2_RRNn             uint8
BldgType_1Fam               uint8
BldgType_2fmCon             uint8
BldgType_Duplex             uint8
BldgType_Twnhs              uint8
BldgType_TwnhsE             uint8
HouseStyle_1.5Fin           uint8
HouseStyle_1.5Unf           uint8
HouseStyle_1Story           uint8
HouseStyle_2.5Fin           uint8
HouseStyle_2.5Unf           uint8
HouseStyle_2Story           uint8
HouseStyle_SFoyer           uint8
HouseStyle_SLvl             uint8
RoofStyle_Flat              uint8
RoofStyle_Gable             uint8
RoofStyle_Gambrel           uint8
RoofStyle_Hip               uint8
RoofStyle_Mansard           uint8
RoofStyle_Shed              uint8
RoofMatl_ClyTile            uint8
RoofMatl_CompShg            uint8
RoofMatl_Membran            uint8
RoofMatl_Metal              uint8
RoofMatl_Roll               uint8
RoofMatl_Tar&Grv            uint8
RoofMatl_WdShake            uint8
RoofMatl_WdShngl            uint8
Exterior1st_AsbShng         uint8
Exterior1st_AsphShn         uint8
Exterior1st_BrkComm         uint8
Exterior1st_BrkFace         uint8
Exterior1st_CBlock          uint8
Exterior1st_CemntBd         uint8
Exterior1st_HdBoard         uint8
Exterior1st_ImStucc         uint8
Exterior1st_MetalSd         uint8
Exterior1st_Plywood         uint8
Exterior1st_Stone           uint8
Exterior1st_Stucco          uint8
Exterior1st_VinylSd         uint8
Exterior1st_Wd Sdng         uint8
Exterior1st_WdShing         uint8
Exterior1st__NA_            uint8
Exterior2nd_AsbShng         uint8
Exterior2nd_AsphShn         uint8
Exterior2nd_Brk Cmn         uint8
Exterior2nd_BrkFace         uint8
Exterior2nd_CBlock          uint8
Exterior2nd_CmentBd         uint8
Exterior2nd_HdBoard         uint8
Exterior2nd_ImStucc         uint8
Exterior2nd_MetalSd         uint8
Exterior2nd_Other           uint8
Exterior2nd_Plywood         uint8
Exterior2nd_Stone           uint8
Exterior2nd_Stucco          uint8
Exterior2nd_VinylSd         uint8
Exterior2nd_Wd Sdng         uint8
Exterior2nd_Wd Shng         uint8
Exterior2nd__NA_            uint8
MasVnrType_BrkCmn           uint8
MasVnrType_BrkFace          uint8
MasVnrType_None             uint8
MasVnrType_Stone            uint8
MasVnrType__NA_             uint8
ExterQual_Ex                uint8
ExterQual_Fa                uint8
ExterQual_Gd                uint8
ExterQual_TA                uint8
ExterCond_Ex                uint8
ExterCond_Fa                uint8
ExterCond_Gd                uint8
ExterCond_Po                uint8
ExterCond_TA                uint8
Foundation_BrkTil           uint8
Foundation_CBlock           uint8
Foundation_PConc            uint8
Foundation_Slab             uint8
Foundation_Stone            uint8
Foundation_Wood             uint8
BsmtQual_Ex                 uint8
BsmtQual_Fa                 uint8
BsmtQual_Gd                 uint8
BsmtQual_TA                 uint8
BsmtQual__NA_               uint8
BsmtCond_Fa                 uint8
BsmtCond_Gd                 uint8
BsmtCond_Po                 uint8
BsmtCond_TA                 uint8
BsmtCond__NA_               uint8
BsmtExposure_Av             uint8
BsmtExposure_Gd             uint8
BsmtExposure_Mn             uint8
BsmtExposure_No             uint8
BsmtExposure__NA_           uint8
BsmtFinType1_ALQ            uint8
BsmtFinType1_BLQ            uint8
BsmtFinType1_GLQ            uint8
BsmtFinType1_LwQ            uint8
BsmtFinType1_Rec            uint8
BsmtFinType1_Unf            uint8
BsmtFinType1__NA_           uint8
BsmtFinType2_ALQ            uint8
BsmtFinType2_BLQ            uint8
BsmtFinType2_GLQ            uint8
BsmtFinType2_LwQ            uint8
BsmtFinType2_Rec            uint8
BsmtFinType2_Unf            uint8
BsmtFinType2__NA_           uint8
Heating_Floor               uint8
Heating_GasA                uint8
Heating_GasW                uint8
Heating_Grav                uint8
Heating_OthW                uint8
Heating_Wall                uint8
HeatingQC_Ex                uint8
HeatingQC_Fa                uint8
HeatingQC_Gd                uint8
HeatingQC_Po                uint8
HeatingQC_TA                uint8
CentralAir_N                uint8
CentralAir_Y                uint8
Electrical_FuseA            uint8
Electrical_FuseF            uint8
Electrical_FuseP            uint8
Electrical_Mix              uint8
Electrical_SBrkr            uint8
Electrical__NA_             uint8
KitchenQual_Ex              uint8
KitchenQual_Fa              uint8
KitchenQual_Gd              uint8
KitchenQual_TA              uint8
KitchenQual__NA_            uint8
Functional_Maj1             uint8
Functional_Maj2             uint8
Functional_Min1             uint8
Functional_Min2             uint8
Functional_Mod              uint8
Functional_Sev              uint8
Functional_Typ              uint8
Functional__NA_             uint8
FireplaceQu_Ex              uint8
FireplaceQu_Fa              uint8
FireplaceQu_Gd              uint8
FireplaceQu_Po              uint8
FireplaceQu_TA              uint8
FireplaceQu__NA_            uint8
GarageType_2Types           uint8
GarageType_Attchd           uint8
GarageType_Basment          uint8
GarageType_BuiltIn          uint8
GarageType_CarPort          uint8
GarageType_Detchd           uint8
GarageType__NA_             uint8
GarageFinish_Fin            uint8
GarageFinish_RFn            uint8
GarageFinish_Unf            uint8
GarageFinish__NA_           uint8
GarageQual_Ex               uint8
GarageQual_Fa               uint8
GarageQual_Gd               uint8
GarageQual_Po               uint8
GarageQual_TA               uint8
GarageQual__NA_             uint8
GarageCond_Ex               uint8
GarageCond_Fa               uint8
GarageCond_Gd               uint8
GarageCond_Po               uint8
GarageCond_TA               uint8
GarageCond__NA_             uint8
PavedDrive_N                uint8
PavedDrive_P                uint8
PavedDrive_Y                uint8
PoolQC_Ex                   uint8
PoolQC_Fa                   uint8
PoolQC_Gd                   uint8
PoolQC__NA_                 uint8
Fence_GdPrv                 uint8
Fence_GdWo                  uint8
Fence_MnPrv                 uint8
Fence_MnWw                  uint8
Fence__NA_                  uint8
MiscFeature_Gar2            uint8
MiscFeature_Othr            uint8
MiscFeature_Shed            uint8
MiscFeature_TenC            uint8
MiscFeature__NA_            uint8
SaleType_COD                uint8
SaleType_CWD                uint8
SaleType_Con                uint8
SaleType_ConLD              uint8
SaleType_ConLI              uint8
SaleType_ConLw              uint8
SaleType_New                uint8
SaleType_Oth                uint8
SaleType_WD                 uint8
SaleType__NA_               uint8
SaleCondition_Abnorml       uint8
SaleCondition_AdjLand       uint8
SaleCondition_Alloca        uint8
SaleCondition_Family        uint8
SaleCondition_Normal        uint8
SaleCondition_Partial       uint8
SalePrice                   int64
Log1p_SalePrice           float64
StandardScal_SalePrice    float64
MaxMinScal_SalePrice      float64
dtype: object
In [42]:
# Finding numeric features
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric = []
for i in df_train.columns:
    if df_train[i].dtype in numeric_dtypes:
            numeric.append(i)     
# visualising some more outliers in the data values
fig, axs = plt.subplots(ncols=2, nrows=0, figsize=(12, 200))
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
sns.color_palette("husl", 8)
for i, feature in enumerate(list(df_train[numeric]), 1):
    if(feature=='MiscVal'):
        break
    plt.subplot(len(list(numeric)), 3, i)
    sns.scatterplot(x=feature, y='Log1p_SalePrice', hue='Log1p_SalePrice', palette='Blues', data=df_train)
        
    plt.xlabel('{}'.format(feature), size=15,labelpad=12.5)
    plt.ylabel('Log1p_SalePrice', size=15, labelpad=12.5)
    
    for j in range(2):
        plt.tick_params(axis='x', labelsize=12)
        plt.tick_params(axis='y', labelsize=12)
    
    plt.legend(loc='best', prop={'size': 10})
        
plt.show()

Figure 7.1: scatter plots of continuous variables versus Log1p(sale price)

Figure 7.1 Junk Model with only one variable OverallQual

In [44]:
# junk model with OverallQual only
from google.colab import files
saleprice_overallqual = df_train.groupby(['OverallQual'])['SalePrice'].mean()
print(saleprice_overallqual)
df_test_junk = pd.merge(df_test[['Id','OverallQual']],saleprice_overallqual, how='left', on='OverallQual')
df_test_junk.describe()
df_test_junk[['Id', 'SalePrice']].to_csv('OverallQual_SalePrice.csv', index=False)
#files.download('OverallQual_SalePrice.csv')
OverallQual
1     50150.000
2     51770.333
3     87473.750
4    108420.655
5    133523.348
6    161603.035
7    207608.613
8    274735.536
9    367513.023
10   471865.062
Name: SalePrice, dtype: float64

Figure 6.9 Average SalePrice by OverallQual

In [45]:
# split df_train dataset
X_train, X_test, y_train, y_test = train_test_split(df_train[df_train.columns.drop(list(df_train.filter(regex='SalePrice')))], df_train[['Log1p_SalePrice']], test_size=0.2, random_state=321)
In [46]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
X_train.head()

sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [8, 5]})
sns.histplot(df_train['Log1p_SalePrice'],kde=True)
plt.title("Histogram for Log1p_SalePrice")
# Skew and kurt
print("Skewness: %f" % df_train['Log1p_SalePrice'].skew())
print("Kurtosis: %f" % df_train['Log1p_SalePrice'].kurt())
(1165, 363)
(292, 363)
(1165, 1)
(292, 1)
Skewness: 0.123012
Kurtosis: 0.806486

Figure 6.3: Histogram of Log1p_SalePrice

In [47]:
# score junk model
x_train_junk = pd.merge(X_train[['Id','OverallQual']],saleprice_overallqual, how='left', on='OverallQual')
print(np.sqrt(mean_squared_error(y_train['Log1p_SalePrice'], np.log1p(x_train_junk['SalePrice'])))) #0.22977945196526697
x_test_junk =  pd.merge(X_test[['Id','OverallQual']],saleprice_overallqual, how='left', on='OverallQual')
print(np.sqrt(mean_squared_error(y_test['Log1p_SalePrice'], np.log1p(x_test_junk['SalePrice'])))) #0.21681551289920958
#print(y_train['Log1p_SalePrice'].head())
0.22977945196526697
0.21681551289920958
In [48]:
residuals = y_test['Log1p_SalePrice']- np.log1p(x_test_junk['SalePrice'])
mean_residuals = np.mean(residuals)
print(mean_residuals)
print("R squared: {}".format(r2_score(y_true=y_test['Log1p_SalePrice'],y_pred=np.log1p(x_test_junk['SalePrice']))))

# Detecting heteroscedasticity
fig = plt.figure(figsize = (20,20))
y_pred = np.log1p(x_test_junk['SalePrice'])
p1 = sns.scatterplot(y_pred,residuals,ax=fig.add_subplot(2,2,1))
plt.xlabel('predicted values')
plt.ylabel('Residuals')
p1 = plt.title('Residuals vs fitted values plot for homoscedasticity check')

p2 = sns.distplot(residuals,kde=True,ax=fig.add_subplot(2,2,2))
p2 = plt.title('Normality of error terms/residuals')
-0.01485588320252118
R squared: 0.6797714824298213

Figure 7.2 Heteroscedasticity and normality of residuals

junkmodel_OverallQual.png

Figure 9.1: Junk model in Kaggle

Figure 6.1 split train and test on the training dataset

In [49]:
list(df_train.filter(regex='SalePrice'))+['Id']
Out[49]:
['SalePrice',
 'Log1p_SalePrice',
 'StandardScal_SalePrice',
 'MaxMinScal_SalePrice',
 'Id']

Figure 6.2: cross validation

In [50]:
# Setup cross validation folds
kf = KFold(n_splits=10, random_state=321, shuffle=True)
# Define error metrics
def k_rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def cv_rmse(model, X_train, y_train):
    rmse = np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv=kf))
    return (rmse)

scores = {}
In [51]:
# LinearRegression
names=pd.DataFrame(X_train.columns)
linear = LinearRegression().fit(X_train, y_train)
score = cv_rmse(linear,X_train, y_train)
print("linear: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['linear'] = (score.mean(), score.std())
linear: 0.1950 (0.0884)
In [52]:
print(pd.DataFrame(linear.coef_.transpose()).shape)
print(names.shape)
#X_train.columns.shape
coeff_df = pd.concat([pd.DataFrame(linear.coef_.transpose(),columns=['coeff']), pd.DataFrame(X_train.columns,columns=['colname'])], axis=1)
coeff_df.head(10)
(363, 1)
(363, 1)
Out[52]:
coeff colname
0 -0.000 Id
1 -0.003 MSSubClass
2 -0.000 LotFrontage
3 0.143 LotArea
4 0.022 OverallQual
5 0.039 OverallCond
6 0.002 YearBuilt
7 15.324 YearRemodAdd
8 0.004 MasVnrArea
9 -0.001 BsmtFinSF1
In [53]:
## creating function to get model statistics
def get_stats():
    results = sm.OLS(y_train, X_train).fit()
    print(results.summary())
get_stats()
                            OLS Regression Results                            
==============================================================================
Dep. Variable:        Log1p_SalePrice   R-squared:                       0.957
Model:                            OLS   Adj. R-squared:                  0.942
Method:                 Least Squares   F-statistic:                     65.97
Date:                Thu, 07 Oct 2021   Prob (F-statistic):               0.00
Time:                        15:37:15   Log-Likelihood:                 1232.8
No. Observations:                1165   AIC:                            -1880.
Df Residuals:                     872   BIC:                            -396.9
Df Model:                         292                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Id                    -4.075e-06   7.87e-06     -0.518      0.605   -1.95e-05    1.14e-05
MSSubClass               -0.0033      0.010     -0.334      0.738      -0.023       0.016
LotFrontage              -0.0002      0.002     -0.128      0.898      -0.003       0.003
LotArea                   0.1430      0.064      2.218      0.027       0.016       0.269
OverallQual               0.0216      0.017      1.303      0.193      -0.011       0.054
OverallCond               0.0388      0.028      1.405      0.160      -0.015       0.093
YearBuilt                 0.0021      0.008      0.262      0.793      -0.014       0.018
YearRemodAdd             15.3244     27.925      0.549      0.583     -39.484      70.133
MasVnrArea                0.0036      0.002      1.661      0.097      -0.001       0.008
BsmtFinSF1               -0.0010      0.001     -1.289      0.198      -0.003       0.001
BsmtFinSF2               -0.0478      0.066     -0.721      0.471      -0.178       0.082
BsmtUnfSF                -0.0028      0.001     -2.157      0.031      -0.005      -0.000
TotalBsmtSF               0.0098      0.004      2.785      0.005       0.003       0.017
1stFlrSF                291.4036    234.286      1.244      0.214    -168.427     751.234
2ndFlrSF                  0.0005      0.000      1.107      0.268      -0.000       0.001
LowQualFinSF             -0.7889      0.682     -1.157      0.248      -2.127       0.549
GrLivArea               120.2178     83.177      1.445      0.149     -43.033     283.469
BsmtFullBath           -120.3729     80.499     -1.495      0.135    -278.368      37.622
BsmtHalfBath          -1371.3530    915.985     -1.497      0.135   -3169.146     426.440
FullBath               -112.5012     75.021     -1.500      0.134    -259.744      34.742
HalfBath                -32.7760     21.832     -1.501      0.134     -75.625      10.073
BedroomAbvGr             -0.0043      0.026     -0.167      0.867      -0.055       0.046
KitchenAbvGr             -3.7316      3.565     -1.047      0.296     -10.729       3.266
TotRmsAbvGrd              1.2000      1.011      1.187      0.236      -0.784       3.184
Fireplaces               -0.1869      0.673     -0.278      0.781      -1.509       1.135
GarageYrBlt               0.0004      0.000      1.063      0.288      -0.000       0.001
GarageCars                0.9829      1.229      0.799      0.424      -1.430       3.396
GarageArea               -0.0002      0.000     -0.330      0.741      -0.001       0.001
WoodDeckSF                0.0012      0.001      1.212      0.226      -0.001       0.003
OpenPorchSF               0.0044      0.003      1.466      0.143      -0.001       0.010
EnclosedPorch            -0.0050      0.010     -0.524      0.601      -0.024       0.014
3SsnPorch                 0.0383      0.135      0.285      0.776      -0.226       0.303
ScreenPorch               0.0079      0.005      1.709      0.088      -0.001       0.017
PoolArea               -661.8878   1239.876     -0.534      0.594   -3095.378    1771.603
MiscVal                   0.1747      0.076      2.314      0.021       0.027       0.323
MoSold                    0.0005      0.001      0.406      0.685      -0.002       0.003
YrSold                   -0.0052      0.008     -0.647      0.517      -0.021       0.011
TotalSF                  -6.8077      3.053     -2.230      0.026     -12.799      -0.816
TotalPorchSF             -0.0009      0.001     -0.967      0.334      -0.003       0.001
HouseAge                  0.0256      0.094      0.274      0.784      -0.158       0.209
QualityIndex              0.0032      0.003      1.150      0.250      -0.002       0.009
Total_Bathrooms         112.4910     75.027      1.499      0.134     -34.763     259.745
Has_Fireplaces         2926.9012   5466.176      0.535      0.592   -7801.498    1.37e+04
Has_Bsmt               6439.2374    1.2e+04      0.535      0.592   -1.72e+04       3e+04
Has_Garage             6439.2374    1.2e+04      0.535      0.592   -1.72e+04       3e+04
Has_Pool               6439.2374    1.2e+04      0.535      0.592   -1.72e+04       3e+04
Has_2ndStory           6439.2374    1.2e+04      0.535      0.592   -1.72e+04       3e+04
LotFrontage_log          -0.0233      0.067     -0.350      0.726      -0.154       0.108
LotArea_log              -1.4225      0.965     -1.474      0.141      -3.317       0.472
MasVnrArea_log           -0.0692      0.034     -2.014      0.044      -0.137      -0.002
BsmtFinSF1_log            0.0259      0.024      1.063      0.288      -0.022       0.074
BsmtFinSF2_log            0.3038      0.539      0.564      0.573      -0.754       1.361
BsmtUnfSF_log             0.0213      0.014      1.523      0.128      -0.006       0.049
TotalBsmtSF_log           1.2553      1.173      1.071      0.285      -1.046       3.557
1stFlrSF_log           1446.8894   1139.104      1.270      0.204    -788.817    3682.595
2ndFlrSF_log             -0.1531      0.085     -1.794      0.073      -0.321       0.014
LowQualFinSF_log          2.0674      1.875      1.103      0.270      -1.612       5.747
GrLivArea_log           832.7421    603.081      1.381      0.168    -350.919    2016.403
BsmtFullBath_log         10.9353      7.816      1.399      0.162      -4.405      26.275
BsmtHalfBath_log       2029.0596   1355.613      1.497      0.135    -631.586    4689.706
FullBath_log              0.0554      0.180      0.307      0.759      -0.299       0.410
HalfBath_log            -30.8307     20.687     -1.490      0.136     -71.432       9.771
BedroomAbvGr_log         -0.0016      0.092     -0.018      0.986      -0.182       0.178
KitchenAbvGr_log          7.3314      7.459      0.983      0.326      -7.309      21.972
TotRmsAbvGrd_log         -3.2991      2.866     -1.151      0.250      -8.924       2.326
Fireplaces_log            0.6077      1.934      0.314      0.753      -3.188       4.403
GarageCars_log            0.0984      0.329      0.299      0.765      -0.547       0.744
GarageArea_log           -0.4186      1.204     -0.348      0.728      -2.781       1.944
WoodDeckSF_log           -0.0040      0.008     -0.517      0.606      -0.019       0.011
OpenPorchSF_log          -0.0211      0.015     -1.431      0.153      -0.050       0.008
EnclosedPorch_log         0.0498      0.048      1.030      0.303      -0.045       0.145
3SsnPorch_log            -0.0835      0.483     -0.173      0.863      -1.032       0.865
ScreenPorch_log          -0.0402      0.036     -1.103      0.270      -0.112       0.031
PoolArea_log           4112.4404   7683.963      0.535      0.593    -1.1e+04    1.92e+04
MiscVal_log              -0.8366      0.342     -2.450      0.014      -1.507      -0.166
YearRemodAdd_log      -1.509e+04   2.76e+04     -0.546      0.585   -6.93e+04    3.92e+04
TotalSF_log            -181.1994     86.497     -2.095      0.036    -350.967     -11.432
HouseAge_log              0.0734      0.125      0.587      0.558      -0.172       0.319
YearRemodAdd_sq          -0.0019      0.004     -0.552      0.581      -0.009       0.005
LotFrontage_log_sq        0.0053      0.022      0.245      0.806      -0.037       0.047
TotalBsmtSF_log_sq       -0.1914      0.135     -1.421      0.156      -0.456       0.073
1stFlrSF_log_sq        -889.7828    708.812     -1.255      0.210   -2280.960     501.395
2ndFlrSF_log_sq           0.0228      0.013      1.691      0.091      -0.004       0.049
GrLivArea_log_sq       -428.5716    303.378     -1.413      0.158   -1024.008     166.865
GarageCars_log_sq        -1.3766      1.841     -0.748      0.455      -4.991       2.238
GarageArea_log_sq         0.0423      0.117      0.361      0.718      -0.187       0.272
TotalSF_log_sq           51.7642     24.001      2.157      0.031       4.659      98.870
HouseAge_log_sq          -0.0754      0.151     -0.498      0.618      -0.373       0.222
MSZoning_C (all)       1287.5014   2405.115      0.535      0.593   -3432.990    6007.993
MSZoning_FV            1287.9940   2405.123      0.536      0.592   -3432.512    6008.500
MSZoning_RH            1287.9376   2405.121      0.535      0.592   -3432.566    6008.441
MSZoning_RL            1287.9262   2405.121      0.535      0.592   -3432.577    6008.430
MSZoning_RM            1287.8777   2405.120      0.535      0.592   -3432.624    6008.379
MSZoning__NA_         -2.013e-05   3.74e-05     -0.538      0.591   -9.36e-05    5.33e-05
Street_Grvl            3219.5608   6012.805      0.535      0.592   -8581.700     1.5e+04
Street_Pave            3219.6771   6012.798      0.535      0.592   -8581.571     1.5e+04
Alley_Grvl             2146.4052   4008.534      0.535      0.592   -5721.097       1e+04
Alley_Pave             2146.4250   4008.536      0.535      0.592   -5721.081       1e+04
Alley__NA_             2146.4087   4008.535      0.535      0.592   -5721.095       1e+04
LotShape_IR1           1609.7977   3006.400      0.535      0.592   -4290.828    7510.423
LotShape_IR2           1609.7988   3006.398      0.535      0.592   -4290.824    7510.422
LotShape_IR3           1609.8256   3006.400      0.535      0.592   -4290.800    7510.451
LotShape_Reg           1609.8129   3006.400      0.535      0.592   -4290.812    7510.438
LandContour_Bnk        1609.8022   3006.400      0.535      0.592   -4290.823    7510.428
LandContour_HLS        1609.8281   3006.400      0.535      0.592   -4290.797    7510.454
LandContour_Low        1609.7838   3006.401      0.535      0.592   -4290.844    7510.411
LandContour_Lvl        1609.8225   3006.400      0.535      0.592   -4290.803    7510.448
Utilities_AllPub       3219.7341   6012.797      0.535      0.592   -8581.512     1.5e+04
Utilities_NoSeWa       3219.5024   6012.803      0.535      0.592   -8581.755     1.5e+04
Utilities__NA_        -7.098e-06   1.32e-05     -0.539      0.590    -3.3e-05    1.88e-05
LotConfig_Corner       1287.8688   2405.123      0.535      0.592   -3432.637    6008.374
LotConfig_CulDSac      1287.8878   2405.122      0.535      0.592   -3432.616    6008.392
LotConfig_FR2          1287.8262   2405.122      0.535      0.592   -3432.679    6008.332
LotConfig_FR3          1287.8055   2405.115      0.535      0.592   -3432.686    6008.297
LotConfig_Inside       1287.8503   2405.122      0.535      0.592   -3432.654    6008.355
LandSlope_Gtl          2146.4540   4008.534      0.535      0.592   -5721.049       1e+04
LandSlope_Mod          2146.5012   4008.535      0.535      0.592   -5721.003       1e+04
LandSlope_Sev          2146.2845   4008.537      0.535      0.592   -5721.223       1e+04
Neighborhood_Blmngtn    257.5711    481.025      0.535      0.592    -686.530    1201.672
Neighborhood_Blueste    257.5643    481.028      0.535      0.592    -686.544    1201.672
Neighborhood_BrDale     257.6206    481.029      0.536      0.592    -686.490    1201.731
Neighborhood_BrkSide    257.6118    481.023      0.536      0.592    -686.487    1201.710
Neighborhood_ClearCr    257.5673    481.023      0.535      0.592    -686.532    1201.666
Neighborhood_CollgCr    257.5435    481.022      0.535      0.593    -686.553    1201.640
Neighborhood_Crawfor    257.6817    481.024      0.536      0.592    -686.419    1201.783
Neighborhood_Edwards    257.4913    481.023      0.535      0.593    -686.607    1201.590
Neighborhood_Gilbert    257.5392    481.022      0.535      0.593    -686.557    1201.635
Neighborhood_IDOTRR     257.5648    481.024      0.535      0.592    -686.535    1201.665
Neighborhood_MeadowV    257.5290    481.030      0.535      0.593    -686.582    1201.640
Neighborhood_Mitchel    257.4906    481.023      0.535      0.593    -686.607    1201.588
Neighborhood_NAmes      257.5397    481.025      0.535      0.593    -686.563    1201.642
Neighborhood_NPkVill    257.5652    481.025      0.535      0.592    -686.536    1201.667
Neighborhood_NWAmes     257.5239    481.025      0.535      0.593    -686.579    1201.627
Neighborhood_NoRidge    257.6032    481.022      0.536      0.592    -686.492    1201.699
Neighborhood_NridgHt    257.6163    481.023      0.536      0.592    -686.482    1201.714
Neighborhood_OldTown    257.5464    481.025      0.535      0.593    -686.556    1201.649
Neighborhood_SWISU      257.5779    481.023      0.535      0.592    -686.519    1201.675
Neighborhood_Sawyer     257.5483    481.024      0.535      0.592    -686.552    1201.648
Neighborhood_SawyerW    257.5544    481.022      0.535      0.592    -686.541    1201.650
Neighborhood_Somerst    257.5555    481.023      0.535      0.592    -686.542    1201.653
Neighborhood_StoneBr    257.7085    481.024      0.536      0.592    -686.391    1201.808
Neighborhood_Timber     257.5593    481.022      0.535      0.592    -686.537    1201.655
Neighborhood_Veenker    257.5626    481.022      0.535      0.592    -686.534    1201.659
Condition1_Artery       715.4293   1336.176      0.535      0.592   -1907.068    3337.926
Condition1_Feedr        715.4704   1336.179      0.535      0.592   -1907.031    3337.972
Condition1_Norm         715.5173   1336.178      0.535      0.592   -1906.984    3338.019
Condition1_PosA         715.4312   1336.172      0.535      0.592   -1907.058    3337.920
Condition1_PosN         715.5489   1336.179      0.536      0.592   -1906.954    3338.052
Condition1_RRAe         715.3378   1336.178      0.535      0.593   -1907.163    3337.838
Condition1_RRAn         715.4627   1336.178      0.535      0.592   -1907.037    3337.963
Condition1_RRNe         715.5220   1336.175      0.536      0.592   -1906.973    3338.017
Condition1_RRNn         715.5130   1336.178      0.535      0.592   -1906.989    3338.015
Condition2_Artery       785.2697   1466.535      0.535      0.592   -2093.081    3663.621
Condition2_Feedr        785.3382   1466.541      0.536      0.592   -2093.025    3663.701
Condition2_Norm         785.2917   1466.536      0.535      0.592   -2093.062    3663.645
Condition2_PosA         785.4001   1466.531      0.536      0.592   -2092.944    3663.744
Condition2_PosN         785.2005   1466.538      0.535      0.593   -2093.157    3663.558
Condition2_RRAe         942.2867   1759.848      0.535      0.592   -2511.747    4396.320
Condition2_RRAn         785.1470   1466.535      0.535      0.593   -2093.205    3663.499
Condition2_RRNn         785.3045   1466.538      0.535      0.592   -2093.052    3663.661
BldgType_1Fam          1287.8561   2405.118      0.535      0.592   -3432.641    6008.353
BldgType_2fmCon        1287.8445   2405.118      0.535      0.592   -3432.653    6008.342
BldgType_Duplex        1287.8324   2405.119      0.535      0.592   -3432.666    6008.331
BldgType_Twnhs         1287.8402   2405.120      0.535      0.592   -3432.660    6008.341
BldgType_TwnhsE        1287.8604   2405.119      0.535      0.592   -3432.639    6008.360
HouseStyle_1.5Fin       804.9389   1503.199      0.535      0.592   -2145.372    3755.250
HouseStyle_1.5Unf       804.9080   1503.199      0.535      0.592   -2145.403    3755.219
HouseStyle_1Story       804.8807   1503.199      0.535      0.592   -2145.430    3755.192
HouseStyle_2.5Fin       804.8752   1503.203      0.535      0.592   -2145.444    3755.195
HouseStyle_2.5Unf       804.9446   1503.197      0.535      0.592   -2145.363    3755.252
HouseStyle_2Story       804.8798   1503.200      0.535      0.592   -2145.433    3755.193
HouseStyle_SFoyer       804.9061   1503.201      0.535      0.592   -2145.409    3755.221
HouseStyle_SLvl         804.9025   1503.201      0.535      0.592   -2145.411    3755.216
RoofStyle_Flat         1099.3487   2053.150      0.535      0.592   -2930.344    5129.042
RoofStyle_Gable        1099.3934   2053.152      0.535      0.592   -2930.303    5129.090
RoofStyle_Gambrel      1099.3764   2053.150      0.535      0.592   -2930.317    5129.070
RoofStyle_Hip          1099.3899   2053.152      0.535      0.592   -2930.307    5129.086
RoofStyle_Mansard      1099.4426   2053.151      0.535      0.592   -2930.252    5129.137
RoofStyle_Shed          942.2867   1759.848      0.535      0.592   -2511.747    4396.320
RoofMatl_ClyTile      -2.456e-06   4.57e-06     -0.538      0.591   -1.14e-05    6.51e-06
RoofMatl_CompShg        919.7930   1717.942      0.535      0.593   -2451.992    4291.578
RoofMatl_Membran        920.1470   1717.943      0.536      0.592   -2451.640    4291.934
RoofMatl_Metal          919.9876   1717.946      0.536      0.592   -2451.805    4291.780
RoofMatl_Roll           919.8491   1717.935      0.535      0.592   -2451.921    4291.620
RoofMatl_Tar&Grv        919.8167   1717.946      0.535      0.592   -2451.975    4291.608
RoofMatl_WdShake        919.7930   1717.944      0.535      0.593   -2451.995    4291.581
RoofMatl_WdShngl        919.8500   1717.944      0.535      0.592   -2451.938    4291.639
Exterior1st_AsbShng     495.4028    925.048      0.536      0.592   -1320.177    2310.983
Exterior1st_AsphShn     6.53e-06   1.21e-05      0.539      0.590   -1.73e-05    3.03e-05
Exterior1st_BrkComm     495.0680    925.043      0.535      0.593   -1320.502    2310.639
Exterior1st_BrkFace     495.4275    925.048      0.536      0.592   -1320.153    2311.008
Exterior1st_CBlock    -4.323e-06   8.03e-06     -0.538      0.590   -2.01e-05    1.14e-05
Exterior1st_CemntBd     495.2219    925.042      0.535      0.593   -1320.348    2310.792
Exterior1st_HdBoard     495.3272    925.048      0.535      0.592   -1320.253    2310.908
Exterior1st_ImStucc     495.3491    925.047      0.535      0.592   -1320.229    2310.927
Exterior1st_MetalSd     495.4000    925.047      0.536      0.592   -1320.178    2310.979
Exterior1st_Plywood     495.3222    925.047      0.535      0.592   -1320.257    2310.902
Exterior1st_Stone       495.4100    925.049      0.536      0.592   -1320.174    2310.994
Exterior1st_Stucco      495.3994    925.046      0.536      0.592   -1320.177    2310.976
Exterior1st_VinylSd     495.3020    925.047      0.535      0.592   -1320.278    2310.882
Exterior1st_Wd Sdng     495.2957    925.048      0.535      0.592   -1320.286    2310.877
Exterior1st_WdShing     495.3138    925.046      0.535      0.592   -1320.263    2310.891
Exterior1st__NA_       8.856e-06   1.65e-05      0.538      0.591   -2.34e-05    4.12e-05
Exterior2nd_AsbShng     429.2063    801.704      0.535      0.593   -1144.288    2002.701
Exterior2nd_AsphShn     429.3369    801.711      0.536      0.592   -1144.173    2002.847
Exterior2nd_Brk Cmn     429.3589    801.709      0.536      0.592   -1144.145    2002.863
Exterior2nd_BrkFace     429.2343    801.705      0.535      0.593   -1144.263    2002.732
Exterior2nd_CBlock    -1.452e-06   2.69e-06     -0.540      0.590   -6.73e-06    3.83e-06
Exterior2nd_CmentBd     429.4008    801.712      0.536      0.592   -1144.111    2002.912
Exterior2nd_HdBoard     429.2796    801.707      0.535      0.592   -1144.221    2002.780
Exterior2nd_ImStucc     429.2466    801.707      0.535      0.592   -1144.254    2002.748
Exterior2nd_MetalSd     429.2264    801.707      0.535      0.593   -1144.274    2002.727
Exterior2nd_Other       429.2246    801.705      0.535      0.593   -1144.273    2002.722
Exterior2nd_Plywood     429.2854    801.708      0.535      0.592   -1144.218    2002.789
Exterior2nd_Stone       429.2520    801.703      0.535      0.592   -1144.242    2002.746
Exterior2nd_Stucco      429.2389    801.705      0.535      0.593   -1144.258    2002.736
Exterior2nd_VinylSd     429.3303    801.707      0.536      0.592   -1144.171    2002.832
Exterior2nd_Wd Sdng     429.3214    801.706      0.536      0.592   -1144.177    2002.820
Exterior2nd_Wd Shng     429.2968    801.708      0.535      0.592   -1144.206    2002.800
Exterior2nd__NA_      -3.645e-06   6.77e-06     -0.538      0.591   -1.69e-05    9.65e-06
MasVnrType_BrkCmn      1287.8745   2405.119      0.535      0.592   -3432.625    6008.374
MasVnrType_BrkFace     1287.8984   2405.119      0.535      0.592   -3432.600    6008.396
MasVnrType_None        1287.7971   2405.119      0.535      0.592   -3432.701    6008.296
MasVnrType_Stone       1287.9348   2405.119      0.535      0.592   -3432.565    6008.434
MasVnrType__NA_        1287.7293   2405.119      0.535      0.593   -3432.770    6008.229
ExterQual_Ex           1609.7909   3006.403      0.535      0.592   -4290.842    7510.423
ExterQual_Fa           1609.8495   3006.397      0.535      0.592   -4290.771    7510.470
ExterQual_Gd           1609.7965   3006.403      0.535      0.592   -4290.834    7510.427
ExterQual_TA           1609.8018   3006.401      0.535      0.592   -4290.826    7510.430
ExterCond_Ex           1609.8806   3006.405      0.535      0.592   -4290.756    7510.517
ExterCond_Fa           1609.7657   3006.401      0.535      0.592   -4290.862    7510.393
ExterCond_Gd           1609.7853   3006.399      0.535      0.592   -4290.838    7510.409
ExterCond_Po          -2.087e-06   3.88e-06     -0.537      0.591   -9.71e-06    5.53e-06
ExterCond_TA           1609.8066   3006.399      0.535      0.592   -4290.816    7510.430
Foundation_BrkTil      1073.1863   2004.266      0.535      0.592   -2860.563    5006.936
Foundation_CBlock      1073.2146   2004.266      0.535      0.592   -2860.535    5006.964
Foundation_PConc       1073.2213   2004.266      0.535      0.592   -2860.527    5006.970
Foundation_Slab        1073.2575   2004.267      0.535      0.592   -2860.494    5007.009
Foundation_Stone       1073.3166   2004.267      0.536      0.592   -2860.435    5007.068
Foundation_Wood        1073.0410   2004.269      0.535      0.593   -2860.714    5006.796
BsmtQual_Ex            1317.0120   2459.773      0.535      0.592   -3510.755    6144.779
BsmtQual_Fa            1316.9811   2459.774      0.535      0.593   -3510.788    6144.750
BsmtQual_Gd            1316.9711   2459.772      0.535      0.593   -3510.795    6144.738
BsmtQual_TA            1316.9736   2459.775      0.535      0.593   -3510.797    6144.744
BsmtQual__NA_          1171.2987   2186.507      0.536      0.592   -3120.132    5462.730
BsmtCond_Fa            1316.9416   2459.778      0.535      0.593   -3510.835    6144.718
BsmtCond_Gd            1316.9503   2459.778      0.535      0.593   -3510.826    6144.727
BsmtCond_Po            1317.0867   2459.764      0.535      0.592   -3510.662    6144.836
BsmtCond_TA            1316.9612   2459.778      0.535      0.593   -3510.816    6144.739
BsmtCond__NA_          1171.2987   2186.507      0.536      0.592   -3120.132    5462.730
BsmtExposure_Av        1287.8530   2405.121      0.535      0.592   -3432.650    6008.356
BsmtExposure_Gd        1287.8894   2405.122      0.535      0.592   -3432.615    6008.394
BsmtExposure_Mn        1287.8703   2405.121      0.535      0.592   -3432.633    6008.373
BsmtExposure_No        1287.8529   2405.121      0.535      0.592   -3432.650    6008.356
BsmtExposure__NA_      1287.7741   2405.121      0.535      0.592   -3432.728    6008.276
BsmtFinType1_ALQ        877.9881   1639.849      0.535      0.593   -2340.524    4096.501
BsmtFinType1_BLQ        877.9921   1639.849      0.535      0.593   -2340.519    4096.504
BsmtFinType1_GLQ        878.0031   1639.849      0.535      0.592   -2340.508    4096.514
BsmtFinType1_LwQ        877.9684   1639.848      0.535      0.593   -2340.542    4096.479
BsmtFinType1_Rec        877.9624   1639.849      0.535      0.593   -2340.549    4096.474
BsmtFinType1_Unf        878.0220   1639.848      0.535      0.592   -2340.487    4096.531
BsmtFinType1__NA_      1171.2987   2186.507      0.536      0.592   -3120.132    5462.730
BsmtFinType2_ALQ        919.8933   1717.943      0.535      0.592   -2451.893    4291.679
BsmtFinType2_BLQ        919.8631   1717.943      0.535      0.592   -2451.922    4291.649
BsmtFinType2_GLQ        919.9224   1717.944      0.535      0.592   -2451.866    4291.711
BsmtFinType2_LwQ        919.8713   1717.944      0.535      0.592   -2451.917    4291.659
BsmtFinType2_Rec        919.8757   1717.941      0.535      0.592   -2451.907    4291.658
BsmtFinType2_Unf        920.0651   1717.953      0.536      0.592   -2451.741    4291.871
BsmtFinType2__NA_       919.7471   1717.936      0.535      0.593   -2452.026    4291.520
Heating_Floor          1073.2430   2004.259      0.535      0.592   -2860.493    5006.978
Heating_GasA           1073.2425   2004.267      0.535      0.592   -2860.508    5006.993
Heating_GasW           1073.2910   2004.271      0.536      0.592   -2860.469    5007.051
Heating_Grav           1073.0303   2004.266      0.535      0.593   -2860.720    5006.780
Heating_OthW           1073.2624   2004.275      0.535      0.592   -2860.504    5007.028
Heating_Wall           1073.1669   2004.261      0.535      0.592   -2860.573    5006.907
HeatingQC_Ex           1287.8632   2405.121      0.535      0.592   -3432.640    6008.366
HeatingQC_Fa           1287.8682   2405.119      0.535      0.592   -3432.630    6008.366
HeatingQC_Gd           1287.8420   2405.121      0.535      0.592   -3432.660    6008.344
HeatingQC_Po           1287.8349   2405.119      0.535      0.592   -3432.664    6008.334
HeatingQC_TA           1287.8286   2405.121      0.535      0.592   -3432.674    6008.332
CentralAir_N           3219.6031   6012.799      0.535      0.592   -8581.646     1.5e+04
CentralAir_Y           3219.6325   6012.800      0.535      0.592   -8581.619     1.5e+04
Electrical_FuseA       1073.2358   2004.266      0.535      0.592   -2860.513    5006.984
Electrical_FuseF       1073.2647   2004.265      0.535      0.592   -2860.483    5007.013
Electrical_FuseP       1073.1299   2004.270      0.535      0.592   -2860.627    5006.887
Electrical_Mix         1073.1092   2004.270      0.535      0.593   -2860.647    5006.866
Electrical_SBrkr       1073.2231   2004.266      0.535      0.592   -2860.527    5006.973
Electrical__NA_        1073.2780   2004.271      0.535      0.592   -2860.482    5007.038
KitchenQual_Ex         1609.8646   3006.401      0.535      0.592   -4290.762    7510.491
KitchenQual_Fa         1609.7968   3006.401      0.535      0.592   -4290.830    7510.424
KitchenQual_Gd         1609.7927   3006.400      0.535      0.592   -4290.833    7510.419
KitchenQual_TA         1609.7833   3006.401      0.535      0.592   -4290.844    7510.411
KitchenQual__NA_      -4.367e-11   8.13e-11     -0.537      0.591   -2.03e-10    1.16e-10
Functional_Maj1         919.9299   1717.943      0.535      0.592   -2451.857    4291.717
Functional_Maj2         919.7648   1717.948      0.535      0.593   -2452.032    4291.562
Functional_Min1         919.9634   1717.941      0.536      0.592   -2451.819    4291.746
Functional_Min2         919.9965   1717.944      0.536      0.592   -2451.791    4291.785
Functional_Mod          919.8649   1717.943      0.535      0.592   -2451.921    4291.651
Functional_Sev          919.7029   1717.938      0.535      0.593   -2452.074    4291.479
Functional_Typ          920.0144   1717.944      0.536      0.592   -2451.774    4291.802
Functional__NA_                0          0        nan        nan           0           0
FireplaceQu_Ex          585.3576   1093.236      0.535      0.592   -1560.323    2731.038
FireplaceQu_Fa          585.3724   1093.235      0.535      0.592   -1560.308    2731.053
FireplaceQu_Gd          585.3804   1093.235      0.535      0.592   -1560.299    2731.060
FireplaceQu_Po          585.3958   1093.237      0.535      0.592   -1560.287    2731.079
FireplaceQu_TA          585.3956   1093.234      0.535      0.592   -1560.282    2731.073
FireplaceQu__NA_       3512.3352   6559.424      0.535      0.592   -9361.769    1.64e+04
GarageType_2Types       875.9982   1636.127      0.535      0.593   -2335.208    4087.205
GarageType_Attchd       876.1048   1636.131      0.535      0.592   -2335.111    4087.321
GarageType_Basment      876.1186   1636.131      0.535      0.592   -2335.097    4087.334
GarageType_BuiltIn      876.0969   1636.132      0.535      0.592   -2335.120    4087.314
GarageType_CarPort      876.1025   1636.133      0.535      0.592   -2335.117    4087.321
GarageType_Detchd       876.1087   1636.131      0.535      0.592   -2335.105    4087.323
GarageType__NA_        1182.7074   2208.817      0.535      0.592   -3152.511    5517.926
GarageFinish_Fin       1752.1833   3272.261      0.535      0.592   -4670.244    8174.611
GarageFinish_RFn       1752.1791   3272.261      0.535      0.592   -4670.248    8174.607
GarageFinish_Unf       1752.1654   3272.260      0.535      0.592   -4670.260    8174.591
GarageFinish__NA_      1182.7074   2208.817      0.535      0.592   -3152.511    5517.926
GarageQual_Ex          1051.4626   1963.355      0.536      0.592   -2801.991    4904.916
GarageQual_Fa          1051.2675   1963.358      0.535      0.592   -2802.192    4904.727
GarageQual_Gd          1051.2927   1963.360      0.535      0.592   -2802.171    4904.756
GarageQual_Po          1051.2223   1963.352      0.535      0.592   -2802.226    4904.670
GarageQual_TA          1051.2833   1963.357      0.535      0.592   -2802.175    4904.742
GarageQual__NA_        1182.7074   2208.817      0.535      0.592   -3152.511    5517.926
GarageCond_Ex          1051.2216   1963.360      0.535      0.592   -2802.241    4904.685
GarageCond_Fa          1051.2458   1963.357      0.535      0.592   -2802.211    4904.703
GarageCond_Gd          1051.3218   1963.355      0.535      0.592   -2802.132    4904.776
GarageCond_Po          1051.4127   1963.358      0.536      0.592   -2802.048    4904.873
GarageCond_TA          1051.3281   1963.355      0.535      0.592   -2802.126    4904.783
GarageCond__NA_        1182.7074   2208.817      0.535      0.592   -3152.511    5517.926
PavedDrive_N           2146.4122   4008.532      0.535      0.592   -5721.086       1e+04
PavedDrive_P           2146.3972   4008.532      0.535      0.592   -5721.101       1e+04
PavedDrive_Y           2146.4245   4008.532      0.535      0.592   -5721.073       1e+04
PoolQC_Ex              1357.8106   2539.564      0.535      0.593   -3626.562    6342.183
PoolQC_Fa                      0          0        nan        nan           0           0
PoolQC_Gd              1357.5416   2539.131      0.535      0.593   -3625.981    6341.064
PoolQC__NA_            3723.8807   6946.900      0.536      0.592   -9910.717    1.74e+04
Fence_GdPrv            1287.8613   2405.120      0.535      0.592   -3432.640    6008.362
Fence_GdWo             1287.8169   2405.121      0.535      0.592   -3432.686    6008.320
Fence_MnPrv            1287.8655   2405.122      0.535      0.592   -3432.639    6008.370
Fence_MnWw             1287.8297   2405.120      0.535      0.592   -3432.670    6008.330
Fence__NA_             1287.8652   2405.122      0.535      0.592   -3432.639    6008.369
MiscFeature_Gar2       1609.7123   3006.397      0.535      0.592   -4290.908    7510.332
MiscFeature_Othr       1609.8607   3006.393      0.535      0.592   -4290.752    7510.473
MiscFeature_Shed       1609.9453   3006.409      0.536      0.592   -4290.699    7510.589
MiscFeature_TenC               0          0        nan        nan           0           0
MiscFeature__NA_       1609.7186   3006.402      0.535      0.592   -4290.910    7510.347
SaleType_COD            804.8930   1503.201      0.535      0.592   -2145.423    3755.209
SaleType_CWD            804.9438   1503.200      0.535      0.592   -2145.369    3755.256
SaleType_Con                   0          0        nan        nan           0           0
SaleType_ConLD          805.0210   1503.200      0.536      0.592   -2145.291    3755.333
SaleType_ConLI          804.9164   1503.201      0.535      0.592   -2145.399    3755.232
SaleType_ConLw          804.8769   1503.204      0.535      0.592   -2145.443    3755.197
SaleType_New            804.9593   1503.205      0.535      0.592   -2145.363    3755.282
SaleType_Oth            804.7751   1503.197      0.535      0.593   -2145.533    3755.083
SaleType_WD             804.8562   1503.201      0.535      0.592   -2145.459    3755.172
SaleType__NA_                  0          0        nan        nan           0           0
SaleCondition_Abnorml  1073.1504   2004.270      0.535      0.592   -2860.606    5006.907
SaleCondition_AdjLand  1073.3268   2004.265      0.536      0.592   -2860.420    5007.074
SaleCondition_Alloca   1073.1994   2004.267      0.535      0.592   -2860.552    5006.951
SaleCondition_Family   1073.1879   2004.269      0.535      0.592   -2860.567    5006.943
SaleCondition_Normal   1073.2275   2004.270      0.535      0.592   -2860.529    5006.984
SaleCondition_Partial  1073.1486   2004.267      0.535      0.592   -2860.603    5006.900
==============================================================================
Omnibus:                      383.247   Durbin-Watson:                   1.934
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             3497.824
Skew:                          -1.249   Prob(JB):                         0.00
Kurtosis:                      11.113   Cond. No.                     1.16e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.35e-16. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [54]:
# Make predictions using the testing set
x_test_linear = pd.DataFrame()
print(pd.DataFrame(linear.predict(X_test),columns=['Log1p_SalePrice']).head())
x_test_linear = pd.DataFrame(linear.predict(X_test),columns=['Log1p_SalePrice'])
print(np.sqrt(mean_squared_error(y_test, x_test_linear['Log1p_SalePrice']))) #0.2206744703562279
   Log1p_SalePrice
0           11.753
1           11.561
2           11.770
3           11.898
4           12.097
0.22000034882470196
In [55]:
#use linear regression as the model
lr = LinearRegression()
rfe_mod =  RFECV(lr, step=1, cv=10) 
linear2=rfe_mod.fit(X_train, y_train) #to fit
In [56]:
# Make predictions using the testing set
x_test_linear2 = pd.DataFrame()
linear_test2 = linear2.predict(X_test)
x_test_linear2['Log1p_SalePrice'] = linear_test2
print(np.sqrt(mean_squared_error(y_test, x_test_linear2['Log1p_SalePrice']))) #0.18839622722363722

# check out which features were selected
rfe_mod.support_ 
var_important = pd.DataFrame(rfe_mod.support_,index=X_train.columns,columns=['Rank'])
print(var_important[var_important['Rank']==True])
0.18839622722363722
                      Rank
1stFlrSF              True
GrLivArea             True
BsmtFullBath          True
BsmtHalfBath          True
FullBath              True
HalfBath              True
KitchenAbvGr          True
TotRmsAbvGrd          True
GarageCars            True
PoolArea              True
MiscVal               True
TotalSF               True
Total_Bathrooms       True
LotArea_log           True
1stFlrSF_log          True
GrLivArea_log         True
BsmtHalfBath_log      True
KitchenAbvGr_log      True
TotRmsAbvGrd_log      True
PoolArea_log          True
MiscVal_log           True
YearRemodAdd_log      True
TotalSF_log           True
1stFlrSF_log_sq       True
GrLivArea_log_sq      True
GarageCars_log_sq     True
TotalSF_log_sq        True
MSZoning_C (all)      True
LandSlope_Sev         True
Neighborhood_Crawfor  True
Neighborhood_StoneBr  True
Condition1_RRAe       True
Condition2_RRAe       True
Condition2_RRAn       True
RoofMatl_Membran      True
RoofMatl_Metal        True
Exterior1st_BrkComm   True
Heating_Grav          True
KitchenQual_Ex        True
Functional_Sev        True
Functional_Typ        True
GarageType_2Types     True
GarageQual_Ex         True
GarageCond_Fa         True
PoolQC_Ex             True
PoolQC_Gd             True
PoolQC__NA_           True
In [57]:
var_imp = var_important[var_important['Rank']==True].index.tolist()
print(var_imp)
['1stFlrSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageCars', 'PoolArea', 'MiscVal', 'TotalSF', 'Total_Bathrooms', 'LotArea_log', '1stFlrSF_log', 'GrLivArea_log', 'BsmtHalfBath_log', 'KitchenAbvGr_log', 'TotRmsAbvGrd_log', 'PoolArea_log', 'MiscVal_log', 'YearRemodAdd_log', 'TotalSF_log', '1stFlrSF_log_sq', 'GrLivArea_log_sq', 'GarageCars_log_sq', 'TotalSF_log_sq', 'MSZoning_C (all)', 'LandSlope_Sev', 'Neighborhood_Crawfor', 'Neighborhood_StoneBr', 'Condition1_RRAe', 'Condition2_RRAe', 'Condition2_RRAn', 'RoofMatl_Membran', 'RoofMatl_Metal', 'Exterior1st_BrkComm', 'Heating_Grav', 'KitchenQual_Ex', 'Functional_Sev', 'Functional_Typ', 'GarageType_2Types', 'GarageQual_Ex', 'GarageCond_Fa', 'PoolQC_Ex', 'PoolQC_Gd', 'PoolQC__NA_']
In [58]:
linear2 = LinearRegression().fit(X_train[var_imp], y_train)
score = cv_rmse(linear,X_train[var_imp], y_train)
print("linear: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['linear'] = (score.mean(), score.std())
linear: 0.1560 (0.0202)
In [59]:
## creating function to get model statistics
def get_stats():
    results = sm.OLS(y_train, X_train[var_imp]).fit()
    print(results.summary())
get_stats()
                            OLS Regression Results                            
==============================================================================
Dep. Variable:        Log1p_SalePrice   R-squared:                       0.894
Model:                            OLS   Adj. R-squared:                  0.889
Method:                 Least Squares   F-statistic:                     208.7
Date:                Thu, 07 Oct 2021   Prob (F-statistic):               0.00
Time:                        15:39:50   Log-Likelihood:                 708.76
No. Observations:                1165   AIC:                            -1326.
Df Residuals:                    1119   BIC:                            -1093.
Df Model:                          45                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
1stFlrSF                 6.8608     85.441      0.080      0.936    -160.782     174.504
GrLivArea              -13.4637     29.690     -0.453      0.650     -71.718      44.791
BsmtFullBath            -1.2995      1.207     -1.077      0.282      -3.667       1.068
BsmtHalfBath           -14.4839     13.975     -1.036      0.300     -41.904      12.936
FullBath                -1.2549      1.171     -1.072      0.284      -3.552       1.043
HalfBath                -0.4686      0.457     -1.024      0.306      -1.366       0.429
KitchenAbvGr            -6.8459      3.480     -1.967      0.049     -13.673      -0.018
TotRmsAbvGrd             2.2023      1.086      2.028      0.043       0.072       4.333
GarageCars               0.3146      0.071      4.431      0.000       0.175       0.454
PoolArea                19.2027     53.953      0.356      0.722     -86.657     125.062
MiscVal                  0.1482      0.058      2.566      0.010       0.035       0.262
TotalSF                  1.8341      0.727      2.523      0.012       0.408       3.260
Total_Bathrooms          1.3296      1.171      1.136      0.256      -0.967       3.626
LotArea_log              0.5327      0.068      7.813      0.000       0.399       0.666
1stFlrSF_log            88.9198    469.183      0.190      0.850    -831.657    1009.497
GrLivArea_log          -62.4745    252.639     -0.247      0.805    -558.173     433.224
BsmtHalfBath_log        21.3366     21.243      1.004      0.315     -20.344      63.017
KitchenAbvGr_log        12.5491      7.337      1.710      0.087      -1.847      26.945
TotRmsAbvGrd_log        -6.5930      3.075     -2.144      0.032     -12.626      -0.560
PoolArea_log          -112.2330    334.145     -0.336      0.737    -767.855     543.389
MiscVal_log             -0.5585      0.223     -2.500      0.013      -0.997      -0.120
YearRemodAdd_log         6.4577      0.481     13.435      0.000       5.515       7.401
TotalSF_log             60.9009     32.894      1.851      0.064      -3.640     125.442
1stFlrSF_log_sq        -35.6260    273.665     -0.130      0.896    -572.581     501.329
GrLivArea_log_sq        41.2794    117.851      0.350      0.726    -189.954     272.513
GarageCars_log_sq       -0.3203      0.105     -3.048      0.002      -0.526      -0.114
TotalSF_log_sq         -16.2051      7.911     -2.048      0.041     -31.727      -0.684
MSZoning_C (all)        -0.4662      0.044    -10.511      0.000      -0.553      -0.379
LandSlope_Sev           -0.1532      0.048     -3.198      0.001      -0.247      -0.059
Neighborhood_Crawfor     0.1672      0.022      7.758      0.000       0.125       0.209
Neighborhood_StoneBr     0.1611      0.031      5.250      0.000       0.101       0.221
Condition1_RRAe         -0.1532      0.048     -3.168      0.002      -0.248      -0.058
Condition2_RRAe         -0.3558      0.164     -2.169      0.030      -0.678      -0.034
Condition2_RRAn         -0.1783      0.136     -1.309      0.191      -0.446       0.089
RoofMatl_Membran         0.4383      0.143      3.064      0.002       0.158       0.719
RoofMatl_Metal           0.3500      0.142      2.458      0.014       0.071       0.629
Exterior1st_BrkComm     -0.3041      0.098     -3.089      0.002      -0.497      -0.111
Heating_Grav            -0.2424      0.057     -4.271      0.000      -0.354      -0.131
KitchenQual_Ex           0.1534      0.018      8.681      0.000       0.119       0.188
Functional_Sev          -0.4302      0.136     -3.158      0.002      -0.698      -0.163
Functional_Typ           0.1537      0.017      8.931      0.000       0.120       0.187
GarageType_2Types       -0.2417      0.062     -3.911      0.000      -0.363      -0.120
GarageQual_Ex            0.2437      0.080      3.043      0.002       0.087       0.401
GarageCond_Fa           -0.1686      0.027     -6.338      0.000      -0.221      -0.116
PoolQC_Ex              -38.0643    110.461     -0.345      0.730    -254.799     178.670
PoolQC_Gd              -37.8932    110.438     -0.343      0.732    -254.582     178.795
PoolQC__NA_            -97.6503    302.046     -0.323      0.747    -690.290     494.990
==============================================================================
Omnibus:                      162.415   Durbin-Watson:                   1.937
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              455.115
Skew:                          -0.723   Prob(JB):                     1.49e-99
Kurtosis:                       5.699   Cond. No.                     1.02e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 9.37e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [60]:
list(X_train.columns)
Out[60]:
['Id',
 'MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'TotalSF',
 'TotalPorchSF',
 'HouseAge',
 'QualityIndex',
 'Total_Bathrooms',
 'Has_Fireplaces',
 'Has_Bsmt',
 'Has_Garage',
 'Has_Pool',
 'Has_2ndStory',
 'LotFrontage_log',
 'LotArea_log',
 'MasVnrArea_log',
 'BsmtFinSF1_log',
 'BsmtFinSF2_log',
 'BsmtUnfSF_log',
 'TotalBsmtSF_log',
 '1stFlrSF_log',
 '2ndFlrSF_log',
 'LowQualFinSF_log',
 'GrLivArea_log',
 'BsmtFullBath_log',
 'BsmtHalfBath_log',
 'FullBath_log',
 'HalfBath_log',
 'BedroomAbvGr_log',
 'KitchenAbvGr_log',
 'TotRmsAbvGrd_log',
 'Fireplaces_log',
 'GarageCars_log',
 'GarageArea_log',
 'WoodDeckSF_log',
 'OpenPorchSF_log',
 'EnclosedPorch_log',
 '3SsnPorch_log',
 'ScreenPorch_log',
 'PoolArea_log',
 'MiscVal_log',
 'YearRemodAdd_log',
 'TotalSF_log',
 'HouseAge_log',
 'YearRemodAdd_sq',
 'LotFrontage_log_sq',
 'TotalBsmtSF_log_sq',
 '1stFlrSF_log_sq',
 '2ndFlrSF_log_sq',
 'GrLivArea_log_sq',
 'GarageCars_log_sq',
 'GarageArea_log_sq',
 'TotalSF_log_sq',
 'HouseAge_log_sq',
 'MSZoning_C (all)',
 'MSZoning_FV',
 'MSZoning_RH',
 'MSZoning_RL',
 'MSZoning_RM',
 'MSZoning__NA_',
 'Street_Grvl',
 'Street_Pave',
 'Alley_Grvl',
 'Alley_Pave',
 'Alley__NA_',
 'LotShape_IR1',
 'LotShape_IR2',
 'LotShape_IR3',
 'LotShape_Reg',
 'LandContour_Bnk',
 'LandContour_HLS',
 'LandContour_Low',
 'LandContour_Lvl',
 'Utilities_AllPub',
 'Utilities_NoSeWa',
 'Utilities__NA_',
 'LotConfig_Corner',
 'LotConfig_CulDSac',
 'LotConfig_FR2',
 'LotConfig_FR3',
 'LotConfig_Inside',
 'LandSlope_Gtl',
 'LandSlope_Mod',
 'LandSlope_Sev',
 'Neighborhood_Blmngtn',
 'Neighborhood_Blueste',
 'Neighborhood_BrDale',
 'Neighborhood_BrkSide',
 'Neighborhood_ClearCr',
 'Neighborhood_CollgCr',
 'Neighborhood_Crawfor',
 'Neighborhood_Edwards',
 'Neighborhood_Gilbert',
 'Neighborhood_IDOTRR',
 'Neighborhood_MeadowV',
 'Neighborhood_Mitchel',
 'Neighborhood_NAmes',
 'Neighborhood_NPkVill',
 'Neighborhood_NWAmes',
 'Neighborhood_NoRidge',
 'Neighborhood_NridgHt',
 'Neighborhood_OldTown',
 'Neighborhood_SWISU',
 'Neighborhood_Sawyer',
 'Neighborhood_SawyerW',
 'Neighborhood_Somerst',
 'Neighborhood_StoneBr',
 'Neighborhood_Timber',
 'Neighborhood_Veenker',
 'Condition1_Artery',
 'Condition1_Feedr',
 'Condition1_Norm',
 'Condition1_PosA',
 'Condition1_PosN',
 'Condition1_RRAe',
 'Condition1_RRAn',
 'Condition1_RRNe',
 'Condition1_RRNn',
 'Condition2_Artery',
 'Condition2_Feedr',
 'Condition2_Norm',
 'Condition2_PosA',
 'Condition2_PosN',
 'Condition2_RRAe',
 'Condition2_RRAn',
 'Condition2_RRNn',
 'BldgType_1Fam',
 'BldgType_2fmCon',
 'BldgType_Duplex',
 'BldgType_Twnhs',
 'BldgType_TwnhsE',
 'HouseStyle_1.5Fin',
 'HouseStyle_1.5Unf',
 'HouseStyle_1Story',
 'HouseStyle_2.5Fin',
 'HouseStyle_2.5Unf',
 'HouseStyle_2Story',
 'HouseStyle_SFoyer',
 'HouseStyle_SLvl',
 'RoofStyle_Flat',
 'RoofStyle_Gable',
 'RoofStyle_Gambrel',
 'RoofStyle_Hip',
 'RoofStyle_Mansard',
 'RoofStyle_Shed',
 'RoofMatl_ClyTile',
 'RoofMatl_CompShg',
 'RoofMatl_Membran',
 'RoofMatl_Metal',
 'RoofMatl_Roll',
 'RoofMatl_Tar&Grv',
 'RoofMatl_WdShake',
 'RoofMatl_WdShngl',
 'Exterior1st_AsbShng',
 'Exterior1st_AsphShn',
 'Exterior1st_BrkComm',
 'Exterior1st_BrkFace',
 'Exterior1st_CBlock',
 'Exterior1st_CemntBd',
 'Exterior1st_HdBoard',
 'Exterior1st_ImStucc',
 'Exterior1st_MetalSd',
 'Exterior1st_Plywood',
 'Exterior1st_Stone',
 'Exterior1st_Stucco',
 'Exterior1st_VinylSd',
 'Exterior1st_Wd Sdng',
 'Exterior1st_WdShing',
 'Exterior1st__NA_',
 'Exterior2nd_AsbShng',
 'Exterior2nd_AsphShn',
 'Exterior2nd_Brk Cmn',
 'Exterior2nd_BrkFace',
 'Exterior2nd_CBlock',
 'Exterior2nd_CmentBd',
 'Exterior2nd_HdBoard',
 'Exterior2nd_ImStucc',
 'Exterior2nd_MetalSd',
 'Exterior2nd_Other',
 'Exterior2nd_Plywood',
 'Exterior2nd_Stone',
 'Exterior2nd_Stucco',
 'Exterior2nd_VinylSd',
 'Exterior2nd_Wd Sdng',
 'Exterior2nd_Wd Shng',
 'Exterior2nd__NA_',
 'MasVnrType_BrkCmn',
 'MasVnrType_BrkFace',
 'MasVnrType_None',
 'MasVnrType_Stone',
 'MasVnrType__NA_',
 'ExterQual_Ex',
 'ExterQual_Fa',
 'ExterQual_Gd',
 'ExterQual_TA',
 'ExterCond_Ex',
 'ExterCond_Fa',
 'ExterCond_Gd',
 'ExterCond_Po',
 'ExterCond_TA',
 'Foundation_BrkTil',
 'Foundation_CBlock',
 'Foundation_PConc',
 'Foundation_Slab',
 'Foundation_Stone',
 'Foundation_Wood',
 'BsmtQual_Ex',
 'BsmtQual_Fa',
 'BsmtQual_Gd',
 'BsmtQual_TA',
 'BsmtQual__NA_',
 'BsmtCond_Fa',
 'BsmtCond_Gd',
 'BsmtCond_Po',
 'BsmtCond_TA',
 'BsmtCond__NA_',
 'BsmtExposure_Av',
 'BsmtExposure_Gd',
 'BsmtExposure_Mn',
 'BsmtExposure_No',
 'BsmtExposure__NA_',
 'BsmtFinType1_ALQ',
 'BsmtFinType1_BLQ',
 'BsmtFinType1_GLQ',
 'BsmtFinType1_LwQ',
 'BsmtFinType1_Rec',
 'BsmtFinType1_Unf',
 'BsmtFinType1__NA_',
 'BsmtFinType2_ALQ',
 'BsmtFinType2_BLQ',
 'BsmtFinType2_GLQ',
 'BsmtFinType2_LwQ',
 'BsmtFinType2_Rec',
 'BsmtFinType2_Unf',
 'BsmtFinType2__NA_',
 'Heating_Floor',
 'Heating_GasA',
 'Heating_GasW',
 'Heating_Grav',
 'Heating_OthW',
 'Heating_Wall',
 'HeatingQC_Ex',
 'HeatingQC_Fa',
 'HeatingQC_Gd',
 'HeatingQC_Po',
 'HeatingQC_TA',
 'CentralAir_N',
 'CentralAir_Y',
 'Electrical_FuseA',
 'Electrical_FuseF',
 'Electrical_FuseP',
 'Electrical_Mix',
 'Electrical_SBrkr',
 'Electrical__NA_',
 'KitchenQual_Ex',
 'KitchenQual_Fa',
 'KitchenQual_Gd',
 'KitchenQual_TA',
 'KitchenQual__NA_',
 'Functional_Maj1',
 'Functional_Maj2',
 'Functional_Min1',
 'Functional_Min2',
 'Functional_Mod',
 'Functional_Sev',
 'Functional_Typ',
 'Functional__NA_',
 'FireplaceQu_Ex',
 'FireplaceQu_Fa',
 'FireplaceQu_Gd',
 'FireplaceQu_Po',
 'FireplaceQu_TA',
 'FireplaceQu__NA_',
 'GarageType_2Types',
 'GarageType_Attchd',
 'GarageType_Basment',
 'GarageType_BuiltIn',
 'GarageType_CarPort',
 'GarageType_Detchd',
 'GarageType__NA_',
 'GarageFinish_Fin',
 'GarageFinish_RFn',
 'GarageFinish_Unf',
 'GarageFinish__NA_',
 'GarageQual_Ex',
 'GarageQual_Fa',
 'GarageQual_Gd',
 'GarageQual_Po',
 'GarageQual_TA',
 'GarageQual__NA_',
 'GarageCond_Ex',
 'GarageCond_Fa',
 'GarageCond_Gd',
 'GarageCond_Po',
 'GarageCond_TA',
 'GarageCond__NA_',
 'PavedDrive_N',
 'PavedDrive_P',
 'PavedDrive_Y',
 'PoolQC_Ex',
 'PoolQC_Fa',
 'PoolQC_Gd',
 'PoolQC__NA_',
 'Fence_GdPrv',
 'Fence_GdWo',
 'Fence_MnPrv',
 'Fence_MnWw',
 'Fence__NA_',
 'MiscFeature_Gar2',
 'MiscFeature_Othr',
 'MiscFeature_Shed',
 'MiscFeature_TenC',
 'MiscFeature__NA_',
 'SaleType_COD',
 'SaleType_CWD',
 'SaleType_Con',
 'SaleType_ConLD',
 'SaleType_ConLI',
 'SaleType_ConLw',
 'SaleType_New',
 'SaleType_Oth',
 'SaleType_WD',
 'SaleType__NA_',
 'SaleCondition_Abnorml',
 'SaleCondition_AdjLand',
 'SaleCondition_Alloca',
 'SaleCondition_Family',
 'SaleCondition_Normal',
 'SaleCondition_Partial']

Figure 7.3: Handpicked Linear Model

In [61]:
select_vars0 = ['TotRmsAbvGrd','TotalSF','GrLivArea','GarageCars']
select_vars1 = ['LotArea_log','TotalSF_log','GarageCars_log','HouseAge_log']
select_vars2 = ['TotalSF_log_sq','HouseAge_log_sq']
oth_var = ['KitchenAbvGr','Total_Bathrooms','PoolArea', 'QualityIndex','Fireplaces']
oth_ind = ['Has_Bsmt', 'Has_Garage', 'Has_Pool', 'Has_2ndStory',
          'Neighborhood_BrkSide','Neighborhood_Crawfor',  'Neighborhood_NoRidge',  'Neighborhood_NridgHt','Neighborhood_StoneBr',
           'Id']

features = select_vars1+select_vars2+oth_var+oth_ind
# LinearRegression

X_train0 = X_train[features]

plt.figure(figsize=(20,20))  # on this line I just set the size of figure to 12 by 10.
p=sns.heatmap(X_train[select_vars1+oth_var].corr(), annot=True,cmap='RdYlGn',square=True) 

linear3 = LinearRegression().fit(X_train0, y_train)
score = cv_rmse(linear,X_train0, y_train)
print("linear: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['linear'] = (score.mean(), score.std())
# Make predictions using the testing set
x_test_linear = pd.DataFrame(linear3.predict(X_test[features]),columns=['Log1p_SalePrice'])
print(np.sqrt(mean_squared_error(y_test, x_test_linear['Log1p_SalePrice']))) #0.11475766913083842
linear: 0.1297 (0.0083)
0.11485359543660614

Figure 7.4: Multicollinearity

In [62]:
## creating function to get model statistics
def get_stats():
    results = sm.OLS(y_train, X_train0).fit()
    print(results.summary())
get_stats()
                            OLS Regression Results                            
==============================================================================
Dep. Variable:        Log1p_SalePrice   R-squared:                       0.902
Model:                            OLS   Adj. R-squared:                  0.900
Method:                 Least Squares   F-statistic:                     618.0
Date:                Thu, 07 Oct 2021   Prob (F-statistic):               0.00
Time:                        15:39:51   Log-Likelihood:                 754.57
No. Observations:                1165   AIC:                            -1473.
Df Residuals:                    1147   BIC:                            -1382.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
LotArea_log              0.6646      0.058     11.551      0.000       0.552       0.777
TotalSF_log             -9.4823      1.822     -5.205      0.000     -13.057      -5.908
GarageCars_log           0.1293      0.016      8.098      0.000       0.098       0.161
HouseAge_log             0.0972      0.035      2.812      0.005       0.029       0.165
TotalSF_log_sq           1.7770      0.293      6.068      0.000       1.202       2.352
HouseAge_log_sq         -0.0632      0.009     -7.019      0.000      -0.081      -0.046
KitchenAbvGr            -0.3245      0.082     -3.977      0.000      -0.485      -0.164
Total_Bathrooms          0.0630      0.007      9.061      0.000       0.049       0.077
PoolArea                 0.0499      0.017      2.992      0.003       0.017       0.083
QualityIndex             0.0110      0.000     22.791      0.000       0.010       0.012
Fireplaces               0.0610      0.011      5.596      0.000       0.040       0.082
Has_Bsmt                 5.5358      0.717      7.722      0.000       4.129       6.942
Has_Garage               5.5358      0.717      7.722      0.000       4.129       6.942
Has_Pool                 5.5358      0.717      7.722      0.000       4.129       6.942
Has_2ndStory             5.5358      0.717      7.722      0.000       4.129       6.942
Neighborhood_BrkSide     0.0649      0.020      3.222      0.001       0.025       0.104
Neighborhood_Crawfor     0.1436      0.021      6.853      0.000       0.103       0.185
Neighborhood_NoRidge     0.0947      0.025      3.751      0.000       0.045       0.144
Neighborhood_NridgHt     0.0916      0.019      4.698      0.000       0.053       0.130
Neighborhood_StoneBr     0.1875      0.029      6.419      0.000       0.130       0.245
Id                   -4.049e-06   8.95e-06     -0.452      0.651   -2.16e-05    1.35e-05
==============================================================================
Omnibus:                      389.427   Durbin-Watson:                   1.951
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2307.541
Skew:                          -1.415   Prob(JB):                         0.00
Kurtosis:                       9.287   Cond. No.                     1.84e+20
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 2.46e-32. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [63]:
y_pred = x_test_linear['Log1p_SalePrice']
residuals = y_test['Log1p_SalePrice']- y_pred
mean_residuals = np.mean(residuals)
print(mean_residuals)
print("R squared: {}".format(r2_score(y_true=y_test['Log1p_SalePrice'],y_pred=y_pred)))

# Detecting heteroscedasticity
fig = plt.figure(figsize = (20,20))

p1 = sns.scatterplot(y_pred,residuals,ax=fig.add_subplot(2,2,1))
plt.xlabel('predicted values')
plt.ylabel('Residuals')
p1 = plt.title('Residuals vs fitted values plot for homoscedasticity check')

p2 = sns.distplot(residuals,kde=True,ax=fig.add_subplot(2,2,2))
p2 = plt.title('Normality of error terms/residuals')
-0.019511906803234558
R squared: 0.9101395703738636
In [64]:
df_test_linear = pd.DataFrame()
print(df_test.Id.head())

df_test_linear = pd.DataFrame(linear3.predict(df_test[features]),columns=['Log1p_SalePrice'])
print(df_test_linear.head())
df_test_linear['SalePrice'] = np.expm1(df_test_linear['Log1p_SalePrice'])
df_test_linear['Id'] = df_test['Id'].values
df_test_linear[['Id', 'SalePrice']].to_csv('LinearRegression_SalePrice.csv', index=False)
print(df_test_linear.head())
#files.download('LinearRegression_SalePrice.csv') #0.14259
1460    1461
1461    1462
1462    1463
1463    1464
1464    1465
Name: Id, dtype: int64
   Log1p_SalePrice
0           11.628
1           11.915
2           12.087
3           12.178
4           12.238
   Log1p_SalePrice  SalePrice    Id
0           11.628 112243.255  1461
1           11.915 149536.754  1462
2           12.087 177470.955  1463
3           12.178 194412.206  1464
4           12.238 206512.954  1465
In [65]:
print(df_test_linear['SalePrice'].describe())
print(df_test_junk['SalePrice'].describe())
count     1459.000
mean    177235.729
std      75692.316
min      55795.163
25%     124108.328
50%     157447.408
75%     210526.526
max     783028.818
Name: SalePrice, dtype: float64
count     1459.000
mean    181584.037
std      69823.290
min      50150.000
25%     133523.348
50%     161603.035
75%     207608.613
max     471865.062
Name: SalePrice, dtype: float64

linearRegression.png

Figure 9.2: Handpicked Linear Model in Kaggle

Principal Component Analysis

In [66]:
# Standardizing the features
npca = 50
X_trainPCA = StandardScaler().fit_transform(X_train)
pca = PCA(n_components=npca)
pca_train = pca.fit_transform(X_trainPCA)
pca_trainDF = pd.DataFrame(data = pca_train, columns = ["col"+str(i) for i in range(1, npca+1)])

X_testPCA = StandardScaler().fit_transform(X_test)
pca_test = pca.transform(X_testPCA)
pca_testDF = pd.DataFrame(data = pca_test, columns = ["col"+str(i) for i in range(1, npca+1)])

pca.explained_variance_ratio_

# LinearRegression
names=pd.DataFrame(X_train.columns)
linearPCA = LinearRegression().fit(pca_trainDF, y_train)
score = cv_rmse(linearPCA,pca_trainDF, y_train)
print("linear: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['linearPCA'] = (score.mean(), score.std())
print(pca.explained_variance_ratio_)

print(pca_trainDF.head())
print(pca_testDF.head())
linear: 0.1255 (0.0103)
[0.08835163 0.03419551 0.03257189 0.0274681  0.02401261 0.02154691
 0.01591434 0.01524315 0.01386538 0.01322695 0.01266914 0.01230974
 0.01153875 0.01138356 0.01044695 0.00988861 0.00962357 0.00918755
 0.00903131 0.00890427 0.00873903 0.00838954 0.00806001 0.00784575
 0.00774812 0.00765698 0.00748594 0.00741839 0.00718533 0.00700956
 0.00688341 0.00677867 0.00659874 0.00651504 0.00638706 0.00628656
 0.00620162 0.0059897  0.00592179 0.00583144 0.00575765 0.00567489
 0.00562064 0.00558017 0.00548771 0.0053867  0.00530686 0.00514289
 0.00502012 0.00498828]
    col1   col2   col3   col4   col5  ...  col46  col47  col48  col49  col50
0 -4.140  1.302  7.049 -1.774  0.183  ...  0.392  0.799 -0.151 -0.290 -0.592
1  8.217  3.486 -1.083 -1.849 -0.532  ... -1.012 -0.695 -2.806  1.474 -0.102
2  4.812 -1.324  0.524 -2.359 -1.240  ...  0.872  0.098 -0.722  0.096  0.064
3 -5.172 -1.638  4.994 -1.375  0.890  ... -0.319 -1.028  0.969  1.290  0.858
4  7.226  2.302 -0.831 -1.620 -3.849  ...  0.588  4.106  0.622 -1.070 -1.029

[5 rows x 50 columns]
    col1   col2   col3   col4   col5  ...  col46  col47  col48  col49  col50
0  3.768 -4.498 -0.694 -1.692 -0.539  ...  0.012 -0.876 -1.068 -0.514 -0.560
1  6.936  6.599 -0.857 -1.095 -1.684  ... -0.121 -0.281 -1.207  0.328  0.699
2  3.812 -4.528  1.118  3.146  2.320  ... -4.042  4.017 -2.822  1.927 -0.460
3  3.890 -5.475 -0.343  1.021  0.025  ... -1.707 -1.288 -0.815 -0.149  1.816
4 -4.194  4.698  1.791  0.629 -2.085  ... -0.720 -0.178  0.253  0.314 -0.051

[5 rows x 50 columns]
In [67]:
# Make predictions using the testing set
x_test_linear = pd.DataFrame()
print(pd.DataFrame(linearPCA.predict(pca_testDF),columns=['Log1p_SalePrice']).head())
x_test_linear = pd.DataFrame(linearPCA.predict(pca_testDF),columns=['Log1p_SalePrice'])
print(np.sqrt(mean_squared_error(y_test, x_test_linear['Log1p_SalePrice']))) #0.1107754992636765
   Log1p_SalePrice
0           11.801
1           11.556
2           11.564
3           11.824
4           12.177
0.11198100538529619
In [68]:
df_test_linearPCA = pd.DataFrame()
print(df_test.Id.head())

df_testPCA = StandardScaler().fit_transform(df_test)
pca_df_test = pca.transform(df_testPCA)
pca_df_testDF = pd.DataFrame(data = pca_df_test, columns = ["col"+str(i) for i in range(1, npca+1)])

df_test_linearPCA = pd.DataFrame(linearPCA.predict(pca_df_testDF),columns=['Log1p_SalePrice'])
print(df_test_linearPCA.head())
df_test_linearPCA['SalePrice'] = np.expm1(df_test_linearPCA['Log1p_SalePrice'])
df_test_linearPCA['Id'] = df_test['Id'].values
df_test_linearPCA[['Id', 'SalePrice']].to_csv('LinearRegressionPCA_SalePrice.csv', index=False)
print(df_test_linearPCA.head())
#files.download('LinearRegressionPCA_SalePrice.csv') #.13759
1460    1461
1461    1462
1462    1463
1463    1464
1464    1465
Name: Id, dtype: int64
   Log1p_SalePrice
0           11.626
1           12.047
2           12.134
3           12.272
4           12.188
   Log1p_SalePrice  SalePrice    Id
0           11.626 111994.394  1461
1           12.047 170633.255  1462
2           12.134 186040.808  1463
3           12.272 213587.118  1464
4           12.188 196483.486  1465
In [69]:
print(df_test_junk['SalePrice'].describe())
print(df_test_linear['SalePrice'].describe())
print(df_test_linearPCA['SalePrice'].describe())
count     1459.000
mean    181584.037
std      69823.290
min      50150.000
25%     133523.348
50%     161603.035
75%     207608.613
max     471865.062
Name: SalePrice, dtype: float64
count     1459.000
mean    177235.729
std      75692.316
min      55795.163
25%     124108.328
50%     157447.408
75%     210526.526
max     783028.818
Name: SalePrice, dtype: float64
count     1459.000
mean    180082.548
std      77896.748
min      48679.514
25%     126645.363
50%     159273.332
75%     212119.633
max     761071.329
Name: SalePrice, dtype: float64

Figure 8.1 Support Vector Regressor, KRR, Ridge and Lasso

In [70]:
# Support Vector Regressor
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003))

# KRR
krr = make_pipeline(RobustScaler(),KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5))

# Ridge Regressor
ridge_alphas = [9e-4, 7e-4, 5e-4, 3e-4, 1e-4, 1e-3, 5e-2, 1e-2, 0.1, 0.3, 1, 3, 5, 10, 15, 18, 20, 30, 50, 75, 100]
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=ridge_alphas, cv=kf))

# Lasso Regressor
lasso_alphas = arange (0.0001, 0.002, 0.0001)
lasso = make_pipeline(RobustScaler(), LassoCV(alphas =lasso_alphas, cv=kf))
In [71]:
score = cv_rmse(svr,X_train, y_train)
print("svr: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['svr'] = (score.mean(), score.std())
svr: 0.1135 (0.0073)
In [72]:
svr =svr.fit(X_train, y_train)
x_test_svr = pd.DataFrame()
svr_test = svr.predict(X_test)
x_test_svr['Log1p_SalePrice'] = svr_test
print(np.sqrt(mean_squared_error(y_test, x_test_svr['Log1p_SalePrice'])))
0.10101178077230695
In [179]:
residuals = y_test['Log1p_SalePrice']- x_test_svr['Log1p_SalePrice']
mean_residuals = np.mean(residuals)
print(mean_residuals)
print("R squared: {}".format(r2_score(y_true=y_test['Log1p_SalePrice'],y_pred=x_test_svr['Log1p_SalePrice'])))

# Detecting heteroscedasticity
fig = plt.figure(figsize = (20,20))
y_pred = x_test_svr['Log1p_SalePrice']
p1 = sns.scatterplot(y_pred,residuals,ax=fig.add_subplot(2,2,1))
plt.xlabel('predicted values')
plt.ylabel('Residuals')
p1 = plt.title('Residuals vs fitted values plot for homoscedasticity check')

p2 = sns.distplot(residuals,kde=True,ax=fig.add_subplot(2,2,2))
p2 = plt.title('Normality of error terms/residuals')
-0.037538587136317095
R squared: 0.9304938304523374
In [73]:
df_test.head()
Out[73]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold TotalSF TotalPorchSF HouseAge ... GarageQual__NA_ GarageCond_Ex GarageCond_Fa GarageCond_Gd GarageCond_Po GarageCond_TA GarageCond__NA_ PavedDrive_N PavedDrive_P PavedDrive_Y PoolQC_Ex PoolQC_Fa PoolQC_Gd PoolQC__NA_ Fence_GdPrv Fence_GdWo Fence_MnPrv Fence_MnWw Fence__NA_ MiscFeature_Gar2 MiscFeature_Othr MiscFeature_Shed MiscFeature_TenC MiscFeature__NA_ SaleType_COD SaleType_CWD SaleType_Con SaleType_ConLD SaleType_ConLI SaleType_ConLw SaleType_New SaleType_Oth SaleType_WD SaleType__NA_ SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
1460 1461 4.237 80.000 14.096 5 4.986 1961 1961 0.799 88.256 7.074 41.903 301.226 5.330 1.012 0.660 6.607 0.987 0.597 1 1.109 2 0.992 1.728 0.854 1961.000 1.000 730.000 33.456 0.805 0.769 0.697 18.083 0.676 0.691 6 2010 20.666 45.645 14.353 ... 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
1461 1462 4.237 81.000 14.542 6 4.986 1958 1958 13.911 139.254 0.726 53.358 420.280 5.563 1.012 0.660 6.977 0.987 0.597 1 2.382 3 0.992 1.832 0.854 1958.000 1.000 312.000 64.644 8.595 0.769 0.697 0.819 0.676 8.960 6 2010 23.153 62.150 14.878 ... 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
1462 1463 6.329 74.000 14.474 5 4.386 1997 1998 0.799 125.565 0.726 27.931 313.930 5.351 834.585 0.660 7.168 0.987 0.597 2 2.382 3 0.992 1.832 1.538 1997.000 2.000 482.000 43.645 8.348 0.769 0.697 0.819 0.676 0.691 3 2010 22.903 44.109 6.349 ... 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
1463 1464 6.329 78.000 13.769 6 4.986 1998 1998 6.115 104.537 0.726 46.691 313.380 5.349 806.415 0.660 7.154 0.987 0.597 2 2.382 3 0.992 1.922 1.538 1998.000 2.000 470.000 61.145 8.595 0.769 0.697 0.819 0.676 0.691 6 2010 22.835 59.166 6.041 ... 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
1464 1465 7.942 43.000 12.339 8 4.386 1992 1992 0.799 59.833 0.726 91.507 407.650 5.541 1.012 0.660 6.942 0.987 0.597 2 1.109 2 0.992 1.728 0.854 1992.000 2.000 506.000 0.867 12.936 0.769 0.697 19.855 0.676 0.691 1 2010 22.911 41.852 7.770 ... 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0

5 rows × 363 columns

In [74]:
df_test_svr = df_test[df_test.columns.drop(list(df_test.filter(regex='SalePrice')))]
df_test_svr['Log1p_SalePrice'] = svr.predict(df_test_svr)
df_test_svr['SalePrice'] = np.expm1(df_test_svr['Log1p_SalePrice'])
df_test_svr[['Id', 'SalePrice']].to_csv('SVR_SalePrice.csv', index=False)
#files.download('SVR_SalePrice.csv') #0.12749

SVR.png

Figure 9.3: SVR in Kaggle

In [76]:
score = cv_rmse(krr,X_train, y_train)
print("krr: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['krr'] = (score.mean(), score.std())
krr: 0.1173 (0.0064)
In [77]:
krr =krr.fit(X_train, y_train)
krr_test = krr.predict(X_test)
x_test_krr = pd.DataFrame(krr_test, columns=['Log1p_SalePrice'])
print(np.sqrt(mean_squared_error(y_test, x_test_krr['Log1p_SalePrice'])))
0.1026971253108915

Figure 8.2: Ridge Model

In [78]:
score = cv_rmse(ridge,X_train, y_train)
print("ridge: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['ridge'] = (score.mean(), score.std())
ridge: 0.1149 (0.0071)
In [79]:
ridge =ridge.fit(X_train, y_train)
ridge_test = ridge.predict(X_test)
x_test_ridge = pd.DataFrame(ridge_test, columns=['Log1p_SalePrice'])
print(np.sqrt(mean_squared_error(y_test, x_test_ridge['Log1p_SalePrice'])))
0.1012346361357768
In [180]:
residuals = y_test['Log1p_SalePrice']- x_test_ridge['Log1p_SalePrice']
mean_residuals = np.mean(residuals)
print(mean_residuals)
print("R squared: {}".format(r2_score(y_true=y_test['Log1p_SalePrice'],y_pred=x_test_ridge['Log1p_SalePrice'])))

# Detecting heteroscedasticity
fig = plt.figure(figsize = (20,20))
y_pred = x_test_ridge['Log1p_SalePrice']
p1 = sns.scatterplot(y_pred,residuals,ax=fig.add_subplot(2,2,1))
plt.xlabel('predicted values')
plt.ylabel('Residuals')
p1 = plt.title('Residuals vs fitted values plot for homoscedasticity check')

p2 = sns.distplot(residuals,kde=True,ax=fig.add_subplot(2,2,2))
p2 = plt.title('Normality of error terms/residuals')
-0.02266520877681812
R squared: 0.9301867987452228
In [156]:
df_test_ridge = df_test[df_test.columns.drop(list(df_test.filter(regex='SalePrice')))]
df_test_ridge['Log1p_SalePrice'] = ridge.predict(df_test_ridge)
df_test_ridge['SalePrice'] = np.expm1(df_test_ridge['Log1p_SalePrice'])
df_test_ridge[['Id', 'SalePrice']].to_csv('Ridge_SalePrice.csv', index=False)
#files.download('Ridge_SalePrice.csv') #0.12431

ridge.png

Figure 9.4: Ridge Model in Kaggle

Figure 8.3: Lasso Model

In [80]:
score = cv_rmse(lasso,X_train, y_train)
print("lasso: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['lasso'] = (score.mean(), score.std())
lasso: 0.1130 (0.0073)
In [81]:
lasso =lasso.fit(X_train, y_train)
lasso_test = lasso.predict(X_test)
x_test_lasso = pd.DataFrame(lasso_test, columns=['Log1p_SalePrice'])
print(np.sqrt(mean_squared_error(y_test, x_test_lasso['Log1p_SalePrice'])))
0.10010510850991694
In [157]:
df_test_lasso = df_test[df_test.columns.drop(list(df_test.filter(regex='SalePrice')))]
df_test_lasso['Log1p_SalePrice'] = lasso.predict(df_test_lasso)
df_test_lasso['SalePrice'] = np.expm1(df_test_lasso['Log1p_SalePrice'])
df_test_lasso[['Id', 'SalePrice']].to_csv('Lasso_SalePrice.csv', index=False)
#files.download('Lasso_SalePrice.csv') #0.12749

lasso.png

Figure 9.5: Lasso Model in Kaggle

Elastic Net Regression

In [82]:
alphas = arange (0.0001, 0.002, 0.0001)
len(alphas)
Out[82]:
19
In [83]:
# Definning models
model_ridge = Ridge()
model_lasso = Lasso()
model_elasticNet = ElasticNet(l1_ratio=0.99)

# Tunning hyperparameter
# In this example we use alpha as our hyperparameter lambda. 

ridge_alphas = list(arange (0.0001, 0.002, 0.0001)) + [0.1, 0.3, 1, 3, 5, 10, 15]
lasso_alphas = arange (0.0001, 0.002, 0.0001)

# Evaluating models through k-fold cross-validation defined earlier
cv_ridge = [cv_rmse(Ridge(alpha = alpha),X_train, y_train).mean() 
            for alpha in ridge_alphas]

cv_lasso = [cv_rmse(Lasso(alpha = alpha),X_train, y_train).mean() 
            for alpha in lasso_alphas]

cv_elasticNet = [cv_rmse(ElasticNet(alpha = alpha),X_train, y_train).mean() 
            for alpha in lasso_alphas]

cv_ridge = pd.Series(cv_ridge, index = ridge_alphas)
cv_lasso = pd.Series(cv_lasso, index = lasso_alphas)
cv_elasticNet = pd.Series(cv_elasticNet, index = lasso_alphas)

# In this example we use ratios as our hyperparameter l1_ratio. 
ratios = arange(0, 1, 0.01)

cv_elasticNet2 = [cv_rmse(ElasticNet(l1_ratio = ratio),X_train, y_train).mean() 
            for ratio in ratios]
In [84]:
plt.figure(figsize=(20,10))

plt.subplot(251)
cv_ridge.plot(title = "Ridge Validation Curve for alpha")
plt.xlabel("alpha")
plt.ylabel("rmse")

cv_lasso = pd.Series(cv_lasso, index = lasso_alphas)
plt.subplot(252)
cv_lasso.plot(title = "Lasso Validation Curve")
plt.xlabel("alpha")
plt.ylabel("rmse")

cv_elasticNet = pd.Series(cv_elasticNet, index = lasso_alphas)
plt.subplot(253)
cv_elasticNet.plot(title = "ElasticNet Validation Curve for alpha")
plt.xlabel("alpha")
plt.ylabel("rmse")

plt.subplot(254)
cv_ridge.plot().set_xlim([0, 0.002])
cv_lasso.plot()
cv_elasticNet.plot()
plt.legend(labels=['Ridge','Lasso','ElasticNet'])
plt.title('Models Validation Curves for alpha')
plt.xlabel("alpha")
plt.ylabel("rmse")

cv_elasticNet2 = pd.Series(cv_elasticNet2, index = ratios)
plt.subplot(255)
cv_elasticNet2.plot(title = "ElasticNet Validation Curve for ratio")
plt.xlabel("ratio")
plt.ylabel("rmse")

plt.tight_layout()

Figure 8.1: hyperparameter tuning for Ridge, Lasso, and Elastic Net

In [85]:
ridgeCV = RidgeCV(alphas=alphas, cv=kf)
ridgeCV.fit(X_train, y_train)
# summarize chosen configuration
print('alpha: %f' % ridgeCV.alpha_)
alpha: 0.001900
In [86]:
lassoCV = LassoCV(alphas=alphas, cv=kf)
lassoCV.fit(X_train, y_train)
# summarize chosen configuration
print('alpha: %f' % lassoCV.alpha_)
alpha: 0.000700

Figure 8.4: Elastic Net Regression

In [87]:
elasticNetCV = ElasticNetCV(l1_ratio=ratios, alphas=lasso_alphas, cv=kf)
elasticNetCV.fit(X_train, y_train)
# summarize chosen configuration
print('alpha: %f' % elasticNetCV.alpha_)
print('l1_ratio_: %f' % elasticNetCV.l1_ratio_)
alpha: 0.000700
l1_ratio_: 0.990000
In [106]:
enet =ElasticNet(l1_ratio=.99, alpha=0.00070,random_state=321)
score = cv_rmse(enet,X_train, y_train)
print("ElasticNet: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['enet'] = (score.mean(), score.std())
ElasticNet: 0.1135 (0.0072)
In [ ]:
# Elastic Net Regression
#enet = make_pipeline(RobustScaler(), ElasticNetCV(l1_ratio=.99,alphas =alphas, cv=kf))
#score = cv_rmse(enet,X_train, y_train)
#print("ElasticNet: {:.4f} ({:.4f})".format(score.mean(), score.std()))
#scores['enet'] = (score.mean(), score.std())
In [107]:
enet =enet.fit(X_train, y_train)
In [108]:
enet_test = enet.predict(X_test)
x_test_enet = pd.DataFrame(enet_test, columns=['Log1p_SalePrice'])
print(np.sqrt(mean_squared_error(y_test, x_test_enet['Log1p_SalePrice'])))
0.10067882225714392
In [109]:
# Plot the predictions for each model
sns.set_style("white")
fig = plt.figure(figsize=(24, 12))

ax = sns.pointplot(x=list(scores.keys()), y=[score for score, _ in scores.values()], markers=['o'], linestyles=['-'])
for i, score in enumerate(scores.values()):
    ax.text(i, score[0] + 0.002, '{:.6f}'.format(score[0]), horizontalalignment='left', size='large', color='black', weight='semibold')

plt.ylabel('Score (RMSE)', size=20, labelpad=12.5)
plt.xlabel('Model', size=20, labelpad=12.5)
plt.tick_params(axis='x', labelsize=13.5)
plt.tick_params(axis='y', labelsize=12.5)

plt.title('Scores of Models', size=20)

plt.show()
In [158]:
df_test_enet = df_test[df_test.columns.drop(list(df_test.filter(regex='SalePrice')))]
df_test_enet['Log1p_SalePrice'] = enet.predict(df_test_enet)
df_test_enet['SalePrice'] = np.expm1(df_test_enet['Log1p_SalePrice'])
df_test_enet[['Id', 'SalePrice']].to_csv('ENET_SalePrice.csv', index=False)
#files.download('ENET_SalePrice.csv') #0.12749

enet.png

Figure 9.6: Elastic Net Model in Kaggle

In [173]:
# Fix outleir predictions
submission = df_test[df_test.columns.drop(list(df_test.filter(regex='SalePrice')))]
submission['SalePrice'] = 1/4*(df_test_svr['SalePrice'] + df_test_ridge['SalePrice']+df_test_lasso['SalePrice']+df_test_enet['SalePrice'])
q1 = submission['SalePrice'].quantile(0.0045)
q2 = submission['SalePrice'].quantile(0.99)
print(q1, q2)
print(submission.head())
print(submission[['Id', 'SalePrice']].describe())
submission[['Id', 'SalePrice']].to_csv('Blended_SalePrice.csv', index=False)
#files.download('Blended_SalePrice.csv')
55539.98957354686 441209.3054573772
        Id  MSSubClass  ...  SaleCondition_Partial  SalePrice
1460  1461       4.237  ...                      0 117116.956
1461  1462       4.237  ...                      0 159975.587
1462  1463       6.329  ...                      0 186473.635
1463  1464       6.329  ...                      0 199131.251
1464  1465       7.942  ...                      0 192625.824

[5 rows x 364 columns]
            Id  SalePrice
count 1459.000   1459.000
mean  2190.000 178512.124
std    421.321  79226.242
min   1461.000  46786.611
25%   1825.500 126381.127
50%   2190.000 157376.778
75%   2554.500 210866.448
max   2919.000 941656.668
In [174]:
print(df_test_ridge['SalePrice'].describe())
count     1459.000
mean    177679.858
std      79002.638
min      47719.606
25%     125334.824
50%     156520.270
75%     210771.470
max     888071.479
Name: SalePrice, dtype: float64
In [169]:
# Fix outleir predictions
submission['SalePrice'] = submission['SalePrice'].apply(lambda x: x if x > q1 else x*0.71)
submission['SalePrice'] = submission['SalePrice'].apply(lambda x: x if x < q2 else x*1.18)
print(submission[['Id', 'SalePrice']].describe())
            Id   SalePrice
count 1459.000    1459.000
mean  2190.000  179406.598
std    421.321   84316.069
min   1461.000   33218.494
25%   1825.500  126381.127
50%   2190.000  157376.778
75%   2554.500  210866.448
max   2919.000 1111154.868
In [175]:
#Scale predictions
submission['SalePrice'] = submission['SalePrice']/1.005472
print(submission[['Id', 'SalePrice']].describe())
submission[['Id', 'SalePrice']].to_csv('Blended2_SalePrice.csv', index=False)
#files.download('Blended2_SalePrice.csv')
            Id  SalePrice
count 1459.000   1459.000
mean  2190.000 177540.622
std    421.321  78795.075
min   1461.000  46531.988
25%   1825.500 125693.333
50%   2190.000 156520.299
75%   2554.500 209718.867
max   2919.000 936531.965
In [ ]:
print(df_test_junk['SalePrice'].describe())
print(df_test_linear['SalePrice'].describe())
print(df_test_linearPCA['SalePrice'].describe())
print(df_test_svr['SalePrice'].describe())
print(df_test_ridge['SalePrice'].describe())
print(df_test_lasso['SalePrice'].describe())
print(df_test_enet['SalePrice'].describe())